Suggested citation: Madaga, Lavinia, Jordan Chamberlin, Bisrat Gebrekidan, Robert Hijmans, Nicholaus Kuboja, and Maxwell Mkondiwa. 2024. “Predictive mapping of wholesale grain prices for rural areas in Tanzania.” https://github.com/EiA2030-ex-ante/Tanzania-Price-Data
We are interested in estimating the prices of agricultural food commodities across space and time, on the basis of prices as observed at some market locations. Here we explore methods to model these prices, using monthly data from across Tanzania over the period May 2021-July 2024.
This document describes the process of fitting a Random Forest model to predict these prices. The performance of the Random Forest model is evaluated using Root Mean Square Errors (RMSE) and R-Square as test statistics. We also compare the observed prices with the predicted prices using an out of sample dataset.
library(geodata)
## Warning: package 'geodata' was built under R version 4.3.3
library(lubridate)
library(terra)
library(data.table)
library(randomForest)
library(httr)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.3
library(pdp)
## Warning: package 'pdp' was built under R version 4.3.3
library(gridExtra)
library(stats)
library(dplyr)
library(stringr)
library(fields)
## Warning: package 'fields' was built under R version 4.3.3
## Warning: package 'spam' was built under R version 4.3.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
library(gdistance)
## Warning: package 'gdistance' was built under R version 4.3.3
## Warning: package 'igraph' was built under R version 4.3.3
library(sp)
library(raster)
library(foreign)
library(gdistance)
This dataset contains price information for various crops across different regions and markets in Tanzania from 2021 to 2024. The data was acquired from Tanzania’s Ministry of Industry and Trade.
setwd("H:/Tanzania Price data/Datasets")
prices <- fread("Tanzania_Price_Data_AllCrops_with_Coordinates4.csv")
dim(prices)
## [1] 8961 21
head(prices)
## Region Market Maize..min.price. Maize..max.price. Rice..min.price.
## 1: Arusha Arusha 43000 45000 140000
## 2: Dar es Salaam Temeke 46000 47000 120000
## 3: Dodoma Majengo 45000 50000 132000
## 4: Dodoma Kibaigwa 30000 42000 NA
## 5: Kagera Bukoba 33000 35000 110000
## 6: Manyara Babati 30000 45000 120000
## Rice..max.price. Sorghum..min.price. Sorghum..max.price.
## 1: 170000 60000 65000
## 2: 210000 80000 100000
## 3: 200000 50000 60000
## 4: NA 45000 48000
## 5: 140000 140000 150000
## 6: 170000 60000 80000
## Bulrush.Millet..min.price. Bulrush.Millet..max.price.
## 1: 70000 75000
## 2: 80000 100000
## 3: 52000 64000
## 4: NA NA
## 5: 120000 140000
## 6: 80000 100000
## Finger.Millet..min.price. Finger.Millet..max.price. Wheat..min.price.
## 1: 120000 125000 70000
## 2: 175000 180000 110000
## 3: 200000 252000 NA
## 4: NA NA NA
## 5: 170000 180000 170000
## 6: 120000 130000 100000
## Wheat..max.price. Beans..min.price. Beans..max.price.
## 1: 78000 130000 165000
## 2: 120000 220000 260000
## 3: NA 200000 245000
## 4: NA NA NA
## 5: 180000 110000 170000
## 6: 120000 150000 180000
## Irish.Potatoes..min.price. Irish.Potatoes..max.price. Date Latitude
## 1: 70000 75000 5/5/2021 -3.36968
## 2: 75000 75000 5/5/2021 -6.85097
## 3: 56000 68000 5/5/2021 -6.17971
## 4: NA NA 5/5/2021 -6.08110
## 5: 60000 75000 5/5/2021 -1.33140
## 6: 60000 60000 5/5/2021 -4.21006
## Longitude
## 1: 36.68808
## 2: 39.25672
## 3: 35.74109
## 4: 36.64645
## 5: 31.81293
## 6: 35.74915
table(prices$Market)
##
## Arusha Babati Bariadi Buguruni Bukoba Igawilo
## 374 278 113 10 424 73
## Ilala Iringa Kibaigwa Kigoma Kilombero Kinondoni
## 238 427 211 282 73 203
## Lindi majengo Majengo Manyara Mbeya Mgandini
## 281 61 447 116 117 73
## Mnalani Morogoro Moshi Mpanda Mpimbwe Mtwara
## 24 421 287 195 39 435
## Musoma Mwananyamala Mwanza Namfua Njombe Nyankumbu
## 322 73 384 73 174 111
## Pwani Shinyanga Singida Songea Songwe Sumbawanga
## 89 317 83 396 83 420
## Tabora Tandale Tandika Tanga Temeke Ubungo
## 399 90 99 361 204 81
sapply(prices, class)
## Region Market
## "character" "character"
## Maize..min.price. Maize..max.price.
## "integer" "integer"
## Rice..min.price. Rice..max.price.
## "integer" "integer"
## Sorghum..min.price. Sorghum..max.price.
## "integer" "integer"
## Bulrush.Millet..min.price. Bulrush.Millet..max.price.
## "integer" "integer"
## Finger.Millet..min.price. Finger.Millet..max.price.
## "integer" "integer"
## Wheat..min.price. Wheat..max.price.
## "integer" "integer"
## Beans..min.price. Beans..max.price.
## "integer" "integer"
## Irish.Potatoes..min.price. Irish.Potatoes..max.price.
## "integer" "integer"
## Date Latitude
## "character" "numeric"
## Longitude
## "numeric"
# Convert to date format
prices$Date <- lubridate::mdy(prices$Date)
Check the Region and Market names and coodinates. Make sure the Region and Market names are harmonized and properly geocoded
unique(prices[Region=="Arusha",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Arusha -3.36968 36.68808
## 2: Kilombero -3.37324 36.67870
unique(prices[Region=="Dar es Salaam",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Temeke -6.850970 39.25672
## 2: Kinondoni -6.784070 39.27007
## 3: Ilala -6.829410 39.26289
## 4: Tandika -6.867912 39.25480
## 5: Buguruni -6.838380 39.24359
## 6: Tandale -6.795230 39.24085
## 7: Ubungo -6.793620 39.20966
## 8: Mwananyamala -6.788890 39.25840
unique(prices[Region=="Dodoma",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Majengo -6.17971 35.74109
## 2: Kibaigwa -6.08110 36.64645
unique(prices[Region=="Kagera",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Bukoba -1.3314 31.81293
unique(prices[Region=="Manyara",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Babati -4.21006 35.74915
## 2: Manyara -4.46011 37.20217
unique(prices[Region=="Rukwa",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Sumbawanga -7.95239 31.62319
unique(prices[Region=="Mpanda",.(Market, Latitude, Longitude)])
## Empty data.table (0 rows and 3 cols): Market,Latitude,Longitude
unique(prices[Region=="Mtwara",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mtwara -10.28009 40.18191
unique(prices[Region=="Tabora",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Tabora -5.01972 32.80767
unique(prices[Region=="Tanga",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Tanga -5.074260 39.09993
## 2: Mgandini -5.086235 39.09561
unique(prices[Region=="Iringa",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Iringa -7.78001 35.69671
unique(prices[Region=="Kigoma",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Kigoma -4.89697 29.65987
unique(prices[Region=="Morogoro",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Morogoro -6.82771 37.66542
unique(prices[Region=="Mwanza",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mwanza -2.51969 32.90144
unique(prices[Region=="Mara",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Musoma -1.49982 33.8083
unique(prices[Region=="Ruvuma",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Songea -10.67873 35.64836
unique(prices[Region=="Shinyanga",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Shinyanga -3.67226 33.43069
unique(prices[Region=="Kilimanjaro",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Moshi -3.34865 37.34352
unique(prices[Region=="Mbeya",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mbeya -8.909940 33.45517
## 2: Igawilo -8.924246 33.56788
unique(prices[Region=="Katavi",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mpanda -6.34295 31.07299
## 2: Mpimbwe -7.24425 31.81782
## 3: majengo -6.34809 31.07055
## 4: Majengo -6.34809 31.07055
unique(prices[Region=="Njombe",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Njombe -9.33805 34.76949
unique(prices[Region=="Lindi",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Lindi -9.995 39.708
unique(prices[Region=="Singida",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Singida -4.812610 34.7428
## 2: Namfua -4.821121 34.7470
unique(prices[Region=="Pwani",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Pwani -6.44221 38.90622
## 2: Mnalani -6.44221 38.90622
unique(prices[Region=="Simiyu",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Bariadi -2.80355 33.98699
unique(prices[Region=="Geita",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Nyankumbu -2.870760 32.23408
## 2: Nyankumbu -2.895955 32.22871
unique(prices[Region=="Songwe",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Songwe -8.95179 33.24377
setnames(prices, old = "Maize..min.price.", new = "mai.price.min")
setnames(prices, old = "Rice..min.price.", new = "ric.price.min")
setnames(prices, old = "Sorghum..min.price.", new = "sor.price.min")
setnames(prices, old = "Bulrush.Millet..min.price.", new = "bul.price.min")
setnames(prices, old = "Finger.Millet..min.price.", new = "fin.price.min")
setnames(prices, old = "Wheat..min.price.", new = "whe.price.min")
setnames(prices, old = "Beans..min.price.", new = "bea.price.min")
setnames(prices, old = "Irish.Potatoes..min.price.", new = "pot.price.min")
setnames(prices, old = "Maize..max.price.", new = "mai.price.max")
setnames(prices, old = "Rice..max.price.", new = "ric.price.max")
setnames(prices, old = "Sorghum..max.price.", new = "sor.price.max")
setnames(prices, old = "Bulrush.Millet..max.price.", new = "bul.price.max")
setnames(prices, old = "Finger.Millet..max.price.", new = "fin.price.max")
setnames(prices, old = "Wheat..max.price.", new = "whe.price.max")
setnames(prices, old = "Beans..max.price.", new = "bea.price.max")
setnames(prices, old = "Irish.Potatoes..max.price.", new = "pot.price.max")
sapply(prices, class)
## Region Market mai.price.min mai.price.max ric.price.min
## "character" "character" "integer" "integer" "integer"
## ric.price.max sor.price.min sor.price.max bul.price.min bul.price.max
## "integer" "integer" "integer" "integer" "integer"
## fin.price.min fin.price.max whe.price.min whe.price.max bea.price.min
## "integer" "integer" "integer" "integer" "integer"
## bea.price.max pot.price.min pot.price.max Date Latitude
## "integer" "integer" "integer" "Date" "numeric"
## Longitude
## "numeric"
#convert prices to numeric
prices$mai.price.min <- as.numeric(prices$mai.price.min)
prices$ric.price.min <- as.numeric(prices$ric.price.min)
prices$sor.price.min <- as.numeric(prices$sor.price.min)
prices$bul.price.min <- as.numeric(prices$bul.price.min)
prices$fin.price.min <- as.numeric(prices$fin.price.min)
prices$whe.price.min <- as.numeric(prices$whe.price.min)
prices$bea.price.min <- as.numeric(prices$bea.price.min)
prices$pot.price.min <- as.numeric(prices$pot.price.min)
prices$mai.price.max <- as.numeric(prices$mai.price.max)
prices$ric.price.max <- as.numeric(prices$ric.price.max)
prices$sor.price.max <- as.numeric(prices$sor.price.max)
prices$bul.price.max <- as.numeric(prices$bul.price.max)
prices$fin.price.max <- as.numeric(prices$fin.price.max)
prices$whe.price.max <- as.numeric(prices$whe.price.max)
prices$bea.price.max <- as.numeric(prices$bea.price.max)
prices$pot.price.max <- as.numeric(prices$pot.price.max)
sapply(prices, class)
## Region Market mai.price.min mai.price.max ric.price.min
## "character" "character" "numeric" "numeric" "numeric"
## ric.price.max sor.price.min sor.price.max bul.price.min bul.price.max
## "numeric" "numeric" "numeric" "numeric" "numeric"
## fin.price.min fin.price.max whe.price.min whe.price.max bea.price.min
## "numeric" "numeric" "numeric" "numeric" "numeric"
## bea.price.max pot.price.min pot.price.max Date Latitude
## "numeric" "numeric" "numeric" "Date" "numeric"
## Longitude
## "numeric"
# convert to price per kg
prices$mai.price.min <- prices$mai.price.min/100
prices$ric.price.min <- prices$ric.price.min/100
prices$sor.price.min <- prices$sor.price.min/100
prices$bul.price.min <- prices$bul.price.min/100
prices$fin.price.min <- prices$fin.price.min/100
prices$whe.price.min <- prices$whe.price.min/100
prices$bea.price.min <- prices$bea.price.min/100
prices$pot.price.min <- prices$pot.price.min/100
prices$mai.price.max <- prices$mai.price.max/100
prices$ric.price.max <- prices$ric.price.max/100
prices$sor.price.max <- prices$sor.price.max/100
prices$bul.price.max <- prices$bul.price.max/100
prices$fin.price.max <- prices$fin.price.max/100
prices$whe.price.max <- prices$whe.price.max/100
prices$bea.price.max <- prices$bea.price.max/100
prices$pot.price.max <- prices$pot.price.max/100
# calculate average of min and max
prices$mai.price <- (prices$mai.price.min + prices$mai.price.max) / 2
prices$ric.price <- (prices$ric.price.min + prices$ric.price.max) / 2
prices$sor.price <- (prices$sor.price.min + prices$sor.price.max) / 2
prices$bul.price <- (prices$bul.price.min + prices$bul.price.max) / 2
prices$fin.price <- (prices$fin.price.min + prices$fin.price.max) / 2
prices$whe.price <- (prices$whe.price.min + prices$whe.price.max) / 2
prices$bea.price <- (prices$bea.price.min + prices$bea.price.max) / 2
prices$pot.price <- (prices$pot.price.min + prices$pot.price.max) / 2
#We can add dates by using the year and the month names
prices$Day <- day(prices$Date)
prices$Month <- month(prices$Date)
prices$Year <- year(prices$Date)
# drop unneccessary columns
prices <- prices[,!c("mai.price.min", "mai.price.max",
"ric.price.min", "ric.price.max",
"sor.price.min", "sor.price.max",
"bul.price.min", "bul.price.max",
"fin.price.min", "fin.price.max",
"whe.price.min", "whe.price.max",
"bea.price.min", "bea.price.max",
"pot.price.min", "pot.price.max")]
# calculate monthly mean prices by market
prices.monthly <- prices[, .(mai.price = mean(mai.price, na.rm = TRUE),
ric.price = mean(ric.price, na.rm = TRUE),
sor.price = mean(sor.price, na.rm = TRUE),
bul.price = mean(bul.price, na.rm = TRUE),
fin.price = mean(fin.price, na.rm = TRUE),
whe.price = mean(whe.price, na.rm = TRUE),
bea.price = mean(bea.price, na.rm = TRUE),
pot.price = mean(pot.price, na.rm = TRUE)),
by=.(Region, Market, Month, Year, Latitude, Longitude)]
# reshape to long (so that prices for different commodities can be simultaneously estimated)
prices.monthly
## Region Market Month Year Latitude Longitude mai.price ric.price
## 1: Arusha Arusha 5 2021 -3.36968 36.68808 469.00 1530.000
## 2: Dar es Salaam Temeke 5 2021 -6.85097 39.25672 485.00 1650.000
## 3: Dodoma Majengo 5 2021 -6.17971 35.74109 501.50 1592.000
## 4: Dodoma Kibaigwa 5 2021 -6.08110 36.64645 375.00 NaN
## 5: Kagera Bukoba 5 2021 -1.33140 31.81293 426.25 1260.000
## ---
## 950: Mwanza Mwanza 10 2024 -2.51969 32.90144 735.00 2016.667
## 951: Pwani Mnalani 10 2024 -6.44221 38.90622 925.00 1925.000
## 952: Simiyu Bariadi 10 2024 -2.80355 33.98699 667.50 1466.667
## 953: Kigoma Kigoma 10 2024 -4.89697 29.65987 667.00 1750.000
## 954: Rukwa Sumbawanga 10 2024 -7.95239 31.62319 651.25 1916.667
## sor.price bul.price fin.price whe.price bea.price pot.price
## 1: 695.000 761.000 1333.000 793.0 1545.000 745.0000
## 2: 900.000 900.000 1775.000 1230.0 2420.000 730.0000
## 3: 558.000 581.000 1752.000 NaN 2075.000 618.0000
## 4: 437.500 NaN NaN NaN NaN NaN
## 5: 1375.000 1337.500 1637.500 1637.5 1300.000 675.0000
## ---
## 950: 1416.667 1283.333 1583.333 NaN 2800.000 991.6667
## 951: 1525.000 1800.000 2016.667 2000.0 2920.833 1500.0000
## 952: 1420.833 1825.000 1825.000 2500.0 2845.833 1450.0000
## 953: 1000.000 1500.000 2000.000 1700.0 2025.000 900.0000
## 954: NaN NaN 975.000 725.0 2437.500 670.8333
prices.monthly.long <- melt(prices.monthly, id.vars=c('Region', 'Market', 'Month', 'Year', 'Latitude', 'Longitude'),)
# rename columns
setnames(prices.monthly.long, old="variable", new="Crop")
setnames(prices.monthly.long, old="value", new="pkg")
# replace crop names
prices.monthly.long[Crop == "mai.price", Crop := "Maize"]
prices.monthly.long[Crop == "ric.price", Crop := "Rice"]
prices.monthly.long[Crop == "sor.price", Crop := "Sorghum"]
prices.monthly.long[Crop == "bul.price", Crop := "B.Millet"]
prices.monthly.long[Crop == "fin.price", Crop := "F.Millet"]
prices.monthly.long[Crop == "whe.price", Crop := "Wheat"]
prices.monthly.long[Crop == "bea.price", Crop := "Beans"]
prices.monthly.long[Crop == "pot.price", Crop := "Potato"]
# Reset the factor levels to updated levels
prices.monthly.long[, Crop := factor(Crop)]
# Check the unique values again
unique(prices.monthly.long$Crop)
## [1] Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
## Levels: Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
# generate dummies to use in place of factors (for later spatial predictions, which are struggling with factors)
prices.monthly.long[, maize := ifelse(Crop == "Maize",1,0)]
prices.monthly.long[, rice := ifelse(Crop == "Rice",1,0)]
prices.monthly.long[, sorghum := ifelse(Crop == "Sorghum",1,0)]
prices.monthly.long[, bmillet := ifelse(Crop == "B.Millet",1,0)]
prices.monthly.long[, fmillet := ifelse(Crop == "F.Millet",1,0)]
prices.monthly.long[, wheat := ifelse(Crop == "Wheat",1,0)]
prices.monthly.long[, beans := ifelse(Crop == "Beans",1,0)]
prices.monthly.long[, potato := ifelse(Crop == "Potato",1,0)]
prices.monthly.long[, jan := ifelse(Month == 1 , 1, 0)]
prices.monthly.long[, feb := ifelse(Month == 2 , 1, 0)]
prices.monthly.long[, mar := ifelse(Month == 3 , 1, 0)]
prices.monthly.long[, apr := ifelse(Month == 4 , 1, 0)]
prices.monthly.long[, may := ifelse(Month == 5 , 1, 0)]
prices.monthly.long[, jun := ifelse(Month == 6 , 1, 0)]
prices.monthly.long[, jul := ifelse(Month == 7 , 1, 0)]
prices.monthly.long[, aug := ifelse(Month == 8 , 1, 0)]
prices.monthly.long[, sep := ifelse(Month == 9 , 1, 0)]
prices.monthly.long[, oct := ifelse(Month == 10 , 1, 0)]
prices.monthly.long[, nov := ifelse(Month == 11 , 1, 0)]
prices.monthly.long[, dec := ifelse(Month == 12 , 1, 0)]
# replace NaN with NAs in the price observations
prices.monthly.long[is.nan(pkg), pkg := NA]
# Remove observations with missing observations
prices.monthly.long <- na.omit(prices.monthly.long)
# bring in raster stack as predictors
geodata_path("H:/Tanzania Price data/Datasets/geodata")
list.files("H:/Tanzania Price data/Datasets/geodata", recursive=TRUE)
## [1] "climate/wc2.1_country/TZA_wc2.1_30s_bio.tif"
## [2] "travel/travel_time_to_cities_u3.tif"
## [3] "travel/travel_time_to_cities_u5.tif"
## [4] "TRUE/gadm/gadm41_TZA_0_pk.rds"
## [5] "TRUE/gadm/gadm41_TZA_1_pk.rds"
## [6] "TRUE/gadm/gadm41_TZA_2_pk.rds"
## [7] "TRUE/gadm/gadm41_TZA_3_pk.rds"
## [8] "TRUE/spam/spam2017v2r1_ssa_area.zip"
## [9] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_A.tif"
## [10] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_H.tif"
## [11] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_I.tif"
## [12] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_L.tif"
## [13] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_R.tif"
## [14] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_S.tif"
## [15] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_A.tif"
## [16] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_H.tif"
## [17] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_I.tif"
## [18] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_L.tif"
## [19] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_R.tif"
## [20] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_S.tif"
## [21] "TRUE/spam/spam2017v2r1_ssa_yield.zip"
## [22] "TRUE/travel/travel_time_to_cities_u5.tif"
## [23] "TRUE/travel/travel_time_to_ports_1.tif"
## [24] "TRUE/wc2.1_country/TZA_wc2.1_30s_bio.tif"
# tza0 <- gadm(country="TZA", level=0)
# tza1 <- gadm(country="TZA", level=1)
# tza2 <- gadm(country="TZA", level=2)
# tza3 <- gadm(country="TZA", level=3)
tza0 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_0_pk.rds")
tza1 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_1_pk.rds")
tza2 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_2_pk.rds")
tza3 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_3_pk.rds")
# convert prices observations to vector for mapping
mypts <- vect(prices.monthly.long, geom=c("Longitude", "Latitude"), crs=crs(tza0), keepgeom=TRUE)
# see if these show up correctly
plot(tza1)
plot(mypts, col="Red", add=TRUE)
# text(mypts, label="Market")
# create reference raster
tza_extent <- ext(tza1) |> floor()
r <- crop(rast(res=1/12), tza_extent)
## Interpolate
#xy <- as.matrix(mypts[,c("Longitude", "Latitude")])
xy <- geom(mypts)[,c("y","x")]
#tps <- Tps(xy, p$spatial)
tps <- Tps(xy, mypts$pkg)
sp <- interpolate(r, tps)
sp <- mask(sp, tza1)
plot(sp)
lines(tza1)
## Predict Maize prices with coodinates only
maize_mypts <- mypts[mypts$Crop == "Maize", ]
rf <- randomForest(pkg ~ Longitude + Latitude , data=maize_mypts)
sp3 <- interpolate(r, rf, xyNames=c("Longitude", "Latitude"))
sp3 <- mask(sp3, tza1)
plot(sp3)
lines(tza1)
The covariates used include a mix of crop-specific indicators, temporal variables to capture monthly and yearly effects, geographical coordinates, accessibility measures, climatic conditions, and lagged rainfall to account for delayed effects of weather on crop prices.
ttcity <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/travel/travel_time_to_cities_u5.tif")
ttport <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/travel/travel_time_to_ports_1.tif")
clm <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/wc2.1_country/TZA_wc2.1_30s_bio.tif")
area <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/spam/spam2017V2r1_SSA_H_MAIZ_A.tif")
yield <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_R.tif")
popd <- rast("gpw_v4_population_density_rev11_2020_10m.tif")
names(ttcity) <- c("ttcity_u5") ## travel time cities of 100k or more
names(ttport) <- c("ttport_1") ## travel time to major ports
names(clm) <- gsub("wc2.1_30s_", "", names(clm))
names(area) <- c("MAI_ARE") # SPAM maize area 2010
names(yield) <- c("MAI_YLD") # SPAM maize yield 2010
names(popd) <- c("popdens") # GPW4
comment(ttcity) <- "travel time to cities 100k or more"
comment(ttport) <- "travel time to major ports"
comment(popd) <- "population density 2020 (GPW4 @ 10dm)"
comment(clm)[1] <-"BIO1 = Annual Mean Temperature"
comment(clm)[2] <-"BIO2 = Mean Diurnal Range (Mean of monthly (max temp - min temp))"
comment(clm)[3] <-"BIO3 = Isothermality (BIO2/BIO7) (×100)"
comment(clm)[4] <-"BIO4 = Temperature Seasonality (standard deviation ×100)"
comment(clm)[5] <-"BIO5 = Max Temperature of Warmest Month"
comment(clm)[6] <-"BIO6 = Min Temperature of Coldest Month"
comment(clm)[7] <-"BIO7 = Temperature Annual Range (BIO5-BIO6)"
comment(clm)[8] <-"BIO8 = Mean Temperature of Wettest Quarter"
comment(clm)[9] <-"BIO9 = Mean Temperature of Driest Quarter"
comment(clm)[10] <-"BIO10 = Mean Temperature of Warmest Quarter"
comment(clm)[11] <-"BIO11 = Mean Temperature of Coldest Quarter"
comment(clm)[12] <-"BIO12 = Annual Precipitation"
comment(clm)[13] <-"BIO13 = Precipitation of Wettest Month"
comment(clm)[14] <-"BIO14 = Precipitation of Driest Month"
comment(clm)[15] <-"BIO15 = Precipitation Seasonality (Coefficient of Variation)"
comment(clm)[16] <-"BIO16 = Precipitation of Wettest Quarter"
comment(clm)[17] <-"BIO17 = Precipitation of Driest Quarter"
comment(clm)[18] <-"BIO18 = Precipitation of Warmest Quarter"
comment(clm)[19] <-"BIO19 = Precipitation of Coldest Quarter"
ttcity <- resample(ttcity, r)
ttport <- resample(ttport, r)
clm <- resample(clm, r)
area <- resample(area, r)
popd <- resample(popd, r)
freq(is.na(area))
## layer value count
## 1 1 0 14393
## 2 1 1 6343
area <- classify(area, cbind(NA,0))
yield <- resample(yield, r)
freq(is.na(yield))
## layer value count
## 1 1 0 14393
## 2 1 1 6343
yield <- classify(yield, cbind(NA,0))
# check again
compareGeom(ttcity, ttport, clm, area, yield, popd)
## [1] TRUE
latgrd <- longrd <- r
latgrd[] <- yFromCell(latgrd, 1:ncell(latgrd))
longrd[] <- xFromCell(longrd, 1:ncell(longrd))
names(latgrd) <- c("latitude")
names(longrd) <- c("longitude")
rstack <- c(ttcity, ttport, clm, area, yield, popd, latgrd, longrd)
names(rstack)
## [1] "ttcity_u5" "ttport_1" "bio_1" "bio_2" "bio_3" "bio_4"
## [7] "bio_5" "bio_6" "bio_7" "bio_8" "bio_9" "bio_10"
## [13] "bio_11" "bio_12" "bio_13" "bio_14" "bio_15" "bio_16"
## [19] "bio_17" "bio_18" "bio_19" "MAI_ARE" "MAI_YLD" "popdens"
## [25] "latitude" "longitude"
# create focal mean to extract from (as alternative to using buffers for extraction, which are not supported in terra)
fm <- focalMat(r, d=0.18, type='circle', fillNA=FALSE)
rstack2 <- focal(rstack, w=fm, fun="mean", na.policy="all", fillvalue=NA, # na.rm=TRUE,
expand=TRUE, silent=FALSE) #, filename="", overwrite=FALSE)
# extract values to dataset -- use a 20km buffer
# do a focal sum of 20km radius - this is about 0.18 of a decimal degree... 0.18*112=20.16
fm <- focalMat(r, d=0.18, type='circle', fillNA=FALSE)
rstack2 <- focal(rstack2, w=fm, fun="sum", na.policy="all", fillvalue=NA, na.rm=TRUE,
expand=TRUE, silent=FALSE) #, filename="", overwrite=FALSE)
chirps_path <- "H:/Tanzania Price data/chirps_data"
chirps_files <- list.files(chirps_path, pattern = ".tif$", full.names = TRUE)
# Read all CHIRPS data files into a SpatRaster collection
chirps_rasters <- rast(chirps_files)
#crop to Tanzania boundary
Chirps_Tz <- crop(chirps_rasters, tza1)
writeRaster(Chirps_Tz, "Tz_chirps_monthly_croped.tif", overwrite=TRUE)
Tz_chirps_monthly <- terra::rast("Tz_chirps_monthly_croped.tif")
Tz_chirps_monthly
## class : SpatRaster
## dimensions : 215, 222, 48 (nrow, ncol, nlyr)
## resolution : 0.05, 0.05 (x, y)
## extent : 29.35, 40.45, -11.75, -1.000001 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source : Tz_chirps_monthly_croped.tif
## names : chirp~20.11, chirp~20.12, chirp~21.01, chirp~21.02, chirp~21.03, chirp~21.04, ...
## min values : -9999.0000, -9999.0000, -9999.000, -9999.0000, -9999.0000, -9999.0000, ...
## max values : 459.7112, 504.3329, 508.448, 585.3241, 750.1949, 960.3376, ...
#Replace -9999 with NA
Tz_chirps_monthly <- classify(Tz_chirps_monthly, cbind(-9999,NA))
#extract layer names
layer_names <- names(Tz_chirps_monthly)
layer_names
## [1] "chirps-v2.0.2020.11" "chirps-v2.0.2020.12" "chirps-v2.0.2021.01"
## [4] "chirps-v2.0.2021.02" "chirps-v2.0.2021.03" "chirps-v2.0.2021.04"
## [7] "chirps-v2.0.2021.05" "chirps-v2.0.2021.06" "chirps-v2.0.2021.07"
## [10] "chirps-v2.0.2021.08" "chirps-v2.0.2021.09" "chirps-v2.0.2021.10"
## [13] "chirps-v2.0.2021.11" "chirps-v2.0.2021.12" "chirps-v2.0.2022.01"
## [16] "chirps-v2.0.2022.02" "chirps-v2.0.2022.03" "chirps-v2.0.2022.04"
## [19] "chirps-v2.0.2022.05" "chirps-v2.0.2022.06" "chirps-v2.0.2022.07"
## [22] "chirps-v2.0.2022.08" "chirps-v2.0.2022.09" "chirps-v2.0.2022.10"
## [25] "chirps-v2.0.2022.11" "chirps-v2.0.2022.12" "chirps-v2.0.2023.01"
## [28] "chirps-v2.0.2023.02" "chirps-v2.0.2023.03" "chirps-v2.0.2023.04"
## [31] "chirps-v2.0.2023.05" "chirps-v2.0.2023.06" "chirps-v2.0.2023.07"
## [34] "chirps-v2.0.2023.08" "chirps-v2.0.2023.09" "chirps-v2.0.2023.10"
## [37] "chirps-v2.0.2023.11" "chirps-v2.0.2023.12" "chirps-v2.0.2024.01"
## [40] "chirps-v2.0.2024.02" "chirps-v2.0.2024.03" "chirps-v2.0.2024.04"
## [43] "chirps-v2.0.2024.05" "chirps-v2.0.2024.06" "chirps-v2.0.2024.07"
## [46] "chirps-v2.0.2024.08" "chirps-v2.0.2024.09" "chirps-v2.0.2024.10"
# We need to create a sequence of dates from the layer names
# Extract year and month from layer names and convert to Date
dates <- as.Date(paste0(sub("chirps-v2.0\\.", "", layer_names), "-01"), format = "%Y.%m-%d")
dates
## [1] "2020-11-01" "2020-12-01" "2021-01-01" "2021-02-01" "2021-03-01"
## [6] "2021-04-01" "2021-05-01" "2021-06-01" "2021-07-01" "2021-08-01"
## [11] "2021-09-01" "2021-10-01" "2021-11-01" "2021-12-01" "2022-01-01"
## [16] "2022-02-01" "2022-03-01" "2022-04-01" "2022-05-01" "2022-06-01"
## [21] "2022-07-01" "2022-08-01" "2022-09-01" "2022-10-01" "2022-11-01"
## [26] "2022-12-01" "2023-01-01" "2023-02-01" "2023-03-01" "2023-04-01"
## [31] "2023-05-01" "2023-06-01" "2023-07-01" "2023-08-01" "2023-09-01"
## [36] "2023-10-01" "2023-11-01" "2023-12-01" "2024-01-01" "2024-02-01"
## [41] "2024-03-01" "2024-04-01" "2024-05-01" "2024-06-01" "2024-07-01"
## [46] "2024-08-01" "2024-09-01" "2024-10-01"
# Assign these dates to the SpatRaster object
time(Tz_chirps_monthly) <- dates
#rename the layers to the formatted dates
names(Tz_chirps_monthly) <- dates
# Check the SpatRaster object
print(Tz_chirps_monthly)
## class : SpatRaster
## dimensions : 215, 222, 48 (nrow, ncol, nlyr)
## resolution : 0.05, 0.05 (x, y)
## extent : 29.35, 40.45, -11.75, -1.000001 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## varname : Tz_chirps_monthly_croped
## names : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ...
## min values : 5.859746, 1.903993, 0.8161677, 0.3187093, 0.9539621, 1.17571, ...
## max values : 459.711212, 504.332947, 508.4479980, 585.3240967, 750.1949463, 960.33765, ...
## time (days) : 2020-11-01 to 2024-10-01
# do a focal mean of 100km radius - this is about 0.9 of a decimal degree... 0.9009*112=100.9008
# Calculate the focal mean for each layer (month)
fm_r <- focalMat(Tz_chirps_monthly, d=0.9, type='circle', fillNA=FALSE)
Rainfall_focal_sum_100km <- focal(Tz_chirps_monthly, w=fm_r, fun="mean", na.policy="all", fillvalue=NA, na.rm=TRUE,
expand=TRUE, silent=FALSE)
# Check the result
Rainfall_focal_sum_100km
## class : SpatRaster
## dimensions : 215, 222, 48 (nrow, ncol, nlyr)
## resolution : 0.05, 0.05 (x, y)
## extent : 29.35, 40.45, -11.75, -1.000001 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## varname : Tz_chirps_monthly_croped
## names : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ...
## min values : 20.79649, 27.27837, 8.092316, 4.489223, 11.30184, 15.74836, ...
## max values : 313.14248, 326.88227, 416.590468, 395.422070, 464.67840, 569.81990, ...
## time (days) : 2020-11-01 to 2024-10-01
Rainfall <- Rainfall_focal_sum_100km
#Resample
Rainfall_res <- resample(Rainfall, r)
Rainfall_res
## class : SpatRaster
## dimensions : 144, 144, 48 (nrow, ncol, nlyr)
## resolution : 0.08333333, 0.08333333 (x, y)
## extent : 29, 41, -12, 0 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## names : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ...
## min values : 20.89972, 32.9472, 8.116516, 4.527868, 11.83634, 15.90436, ...
## max values : 310.41034, 326.4339, 416.389160, 395.367279, 462.19266, 550.26270, ...
## time (days) : 2020-11-01 to 2024-10-01
Here we define function that calculates 6 month lag sum of rainfall for each month in our data. The output is raster datsets.
calculate_lagged_sum <- function(raster_stack, num_months = 6) {
# Get the time vector from the raster stack
time_vector <- time(raster_stack)
# Initialize list to store lagged sum rasters
lagged_sum_rasters <- vector("list", length(time_vector))
# Loop through each layer in the raster stack
for (i in seq_along(time_vector)) {
if (i > num_months) { # We need at least 'num_months' previous layers to calculate the lagged sum
# Determine the start and end dates for the lag period
end_date <- time_vector[i] # Date of the current layer being processed
start_date <- end_date %m-% months(num_months) #The date num_months before the end_date
# Select the layers that fall within the lag period
lag_period_layers <- raster_stack[[which(time_vector > start_date & time_vector <= end_date)]]
# Calculate the sum of the selected layers
if (nlyr(lag_period_layers) == num_months) {
lagged_sum_rasters[[i]] <- sum(lag_period_layers, na.rm = TRUE)
} else {
lagged_sum_rasters[[i]] <- rast(nrow = nrow(raster_stack), ncol = ncol(raster_stack),
crs = crs(raster_stack), ext = ext(raster_stack),
vals = NA) # Use an empty raster with NA values
}
} else {
lagged_sum_rasters[[i]] <- rast(nrow = nrow(raster_stack), ncol = ncol(raster_stack),
crs = crs(raster_stack), ext = ext(raster_stack),
vals = NA) # Use an empty raster with NA values
}
}
# Combine the lagged sum rasters into a single raster stack, excluding empty rasters
lagged_sum_stack <- rast(lagged_sum_rasters)
# Set the names for the layers in the lagged sum stack
names(lagged_sum_stack) <- names(raster_stack)[!is.na(lagged_sum_rasters)]
return(lagged_sum_stack)
}
# Calculate the 6-month lagged sum for each period in the raster stack
lagged_rainfall_sum <- calculate_lagged_sum(Rainfall_res, num_months = 6)
# Remove the first 6 layers from the raster stack since they are empty
lagged_rainfall_sum_filtered <- lagged_rainfall_sum[[7:nlyr(lagged_rainfall_sum)]]
# check the result
print(lagged_rainfall_sum_filtered)
## class : SpatRaster
## dimensions : 144, 144, 42 (nrow, ncol, nlyr)
## resolution : 0.08333333, 0.08333333 (x, y)
## extent : 29, 41, -12, 0 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## names : 2021-05-01, 2021-06-01, 2021-07-01, 2021-08-01, 2021-09-01, 2021-10-01, ...
## min values : 172.7856, 105.5447, 105.4121, 105.1864, 26.20571, 7.070483, ...
## max values : 1512.4689, 1355.9669, 1052.9220, 1051.9214, 1038.13874, 643.716021, ...
names(lagged_rainfall_sum_filtered) <- paste0(names(lagged_rainfall_sum_filtered), "_rain.sum.lag")
plot(lagged_rainfall_sum_filtered)
## We'll have to include lagged_rainfall_sum_filtered in the predictor stack.
rstack
## class : SpatRaster
## dimensions : 144, 144, 26 (nrow, ncol, nlyr)
## resolution : 0.08333333, 0.08333333 (x, y)
## extent : 29, 41, -12, 0 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## names : ttcity_u5, ttport_1, bio_1, bio_2, bio_3, bio_4, ...
## min values : 3.350081e-02, 997.9427, 4.333545, 6.546645, 53.85881, 19.75255, ...
## max values : 1.166560e+03, 3043.3459, 28.472235, 15.285786, 93.14099, 266.33594, ...
#names(rstack)
rstack3 <- c(rstack, lagged_rainfall_sum_filtered)
names(rstack3)
## [1] "ttcity_u5" "ttport_1"
## [3] "bio_1" "bio_2"
## [5] "bio_3" "bio_4"
## [7] "bio_5" "bio_6"
## [9] "bio_7" "bio_8"
## [11] "bio_9" "bio_10"
## [13] "bio_11" "bio_12"
## [15] "bio_13" "bio_14"
## [17] "bio_15" "bio_16"
## [19] "bio_17" "bio_18"
## [21] "bio_19" "MAI_ARE"
## [23] "MAI_YLD" "popdens"
## [25] "latitude" "longitude"
## [27] "2021-05-01_rain.sum.lag" "2021-06-01_rain.sum.lag"
## [29] "2021-07-01_rain.sum.lag" "2021-08-01_rain.sum.lag"
## [31] "2021-09-01_rain.sum.lag" "2021-10-01_rain.sum.lag"
## [33] "2021-11-01_rain.sum.lag" "2021-12-01_rain.sum.lag"
## [35] "2022-01-01_rain.sum.lag" "2022-02-01_rain.sum.lag"
## [37] "2022-03-01_rain.sum.lag" "2022-04-01_rain.sum.lag"
## [39] "2022-05-01_rain.sum.lag" "2022-06-01_rain.sum.lag"
## [41] "2022-07-01_rain.sum.lag" "2022-08-01_rain.sum.lag"
## [43] "2022-09-01_rain.sum.lag" "2022-10-01_rain.sum.lag"
## [45] "2022-11-01_rain.sum.lag" "2022-12-01_rain.sum.lag"
## [47] "2023-01-01_rain.sum.lag" "2023-02-01_rain.sum.lag"
## [49] "2023-03-01_rain.sum.lag" "2023-04-01_rain.sum.lag"
## [51] "2023-05-01_rain.sum.lag" "2023-06-01_rain.sum.lag"
## [53] "2023-07-01_rain.sum.lag" "2023-08-01_rain.sum.lag"
## [55] "2023-09-01_rain.sum.lag" "2023-10-01_rain.sum.lag"
## [57] "2023-11-01_rain.sum.lag" "2023-12-01_rain.sum.lag"
## [59] "2024-01-01_rain.sum.lag" "2024-02-01_rain.sum.lag"
## [61] "2024-03-01_rain.sum.lag" "2024-04-01_rain.sum.lag"
## [63] "2024-05-01_rain.sum.lag" "2024-06-01_rain.sum.lag"
## [65] "2024-07-01_rain.sum.lag" "2024-08-01_rain.sum.lag"
## [67] "2024-09-01_rain.sum.lag" "2024-10-01_rain.sum.lag"
#Extract to the point dataset
extr1 <- terra::extract(rstack3, mypts, method = "bilinear")
mypts <- cbind(mypts, extr1)
# Remove the ID column from the dataset
mypts <- mypts[, !names(mypts) %in% "ID"]
Here we extract sum of lag rainfall for each row under the column rain.sum.lag
mypts_df <- as.data.frame(mypts)
# Define the function to obtain sum of lag rainfall from corresponding rasters to mypts under rain.sum.lag column (for each row)
# Each extraction has to match the month and year
get_rain_sum_row <- function(current_date, mypts_row) {
# Extract the rainfall value for the current date
rain_sum <- mypts_row[[paste0(current_date, "_rain.sum.lag")]]
return(rain_sum)
}
# Loop through each row and obtain the rainfall sum for each month and year
for (i in 1:nrow(mypts_df)) {
# Extract relevant data for the current row
month <- mypts_df$Month[i]
year <- mypts_df$Year[i]
current_date <- paste0(year, "-", sprintf("%02d", month), "-01") # Format date correctly
# Pass necessary data to the function
rain_sum <- get_rain_sum_row(current_date, mypts_df[i, ])
# Update the rain.sum.lag column
mypts_df$rain.sum.lag[i] <- rain_sum
}
# Update the SpatVector with the new rain.avg column
mypts$rain.sum.lag <- mypts_df$rain.sum.lag
# I'll drop the dates with rain.sum.lag from mypts, seems redundant
column_indices <- grep("^202[0-4]-", names(mypts))
mypts <- mypts[, -column_indices]
names(mypts)
## [1] "Region" "Market" "Month" "Year" "Latitude"
## [6] "Longitude" "Crop" "pkg" "maize" "rice"
## [11] "sorghum" "bmillet" "fmillet" "wheat" "beans"
## [16] "potato" "jan" "feb" "mar" "apr"
## [21] "may" "jun" "jul" "aug" "sep"
## [26] "oct" "nov" "dec" "ttcity_u5" "ttport_1"
## [31] "bio_1" "bio_2" "bio_3" "bio_4" "bio_5"
## [36] "bio_6" "bio_7" "bio_8" "bio_9" "bio_10"
## [41] "bio_11" "bio_12" "bio_13" "bio_14" "bio_15"
## [46] "bio_16" "bio_17" "bio_18" "bio_19" "MAI_ARE"
## [51] "MAI_YLD" "popdens" "latitude" "longitude" "rain.sum.lag"
#define Month as a factor
#mypts$Month <- as.factor(mypts$Month)
#levels(mypts$Month)
#We'll define month as an interger instead.
# Check to make sure Month is interger
sapply(mypts, class)
## Region Market Month Year Latitude Longitude
## "character" "character" "integer" "integer" "numeric" "numeric"
## Crop pkg maize rice sorghum bmillet
## "factor" "numeric" "numeric" "numeric" "numeric" "numeric"
## fmillet wheat beans potato jan feb
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## mar apr may jun jul aug
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## sep oct nov dec ttcity_u5 ttport_1
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## bio_1 bio_2 bio_3 bio_4 bio_5 bio_6
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## bio_7 bio_8 bio_9 bio_10 bio_11 bio_12
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## bio_13 bio_14 bio_15 bio_16 bio_17 bio_18
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## bio_19 MAI_ARE MAI_YLD popdens latitude longitude
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## rain.sum.lag
## "numeric"
# drop levels that don't exist in Crop field
mypts$Crop <- mypts$Crop[,drop=TRUE]
levels(mypts$Crop)
## [1] "Maize" "Rice" "Sorghum" "B.Millet" "F.Millet" "Wheat" "Beans"
## [8] "Potato"
Coefficient estimates from the linear model provide a detailed insight into the relationship between each predictor and the response variable.
# Fit the linear model
lm_model <- lm(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
Month +
Year +
ttcity_u5 + ttport_1 +
bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 +
bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13
+ bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 +
MAI_ARE + MAI_YLD +
Longitude + Latitude +
rain.sum.lag,
data = mypts)
summary(lm_model)
##
## Call:
## lm(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet +
## wheat + beans + potato + Month + Year + ttcity_u5 + ttport_1 +
## bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + bio_7 + bio_8 +
## bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14 + bio_15 +
## bio_16 + bio_17 + bio_18 + bio_19 + MAI_ARE + MAI_YLD + Longitude +
## Latitude + rain.sum.lag, data = mypts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1371.18 -256.31 -15.15 239.45 2295.70
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.352e+05 1.045e+04 -32.070 < 2e-16 ***
## maize -9.316e+01 1.803e+01 -5.166 2.46e-07 ***
## rice 1.344e+03 1.802e+01 74.603 < 2e-16 ***
## sorghum 3.805e+02 1.901e+01 20.015 < 2e-16 ***
## bmillet 4.743e+02 2.028e+01 23.388 < 2e-16 ***
## fmillet 7.742e+02 1.874e+01 41.313 < 2e-16 ***
## wheat 9.343e+02 2.051e+01 45.554 < 2e-16 ***
## beans 1.529e+03 1.804e+01 84.753 < 2e-16 ***
## potato NA NA NA NA
## Month -6.529e-01 1.897e+00 -0.344 0.730686
## Year 1.586e+02 4.894e+00 32.417 < 2e-16 ***
## ttcity_u5 1.832e+00 1.565e-01 11.704 < 2e-16 ***
## ttport_1 -1.798e+00 1.700e-01 -10.575 < 2e-16 ***
## bio_1 -2.311e+03 1.879e+02 -12.298 < 2e-16 ***
## bio_2 -2.291e+03 2.166e+02 -10.580 < 2e-16 ***
## bio_3 2.032e+02 1.813e+01 11.213 < 2e-16 ***
## bio_4 1.540e+01 3.835e+00 4.015 6.01e-05 ***
## bio_5 1.719e+03 1.682e+02 10.216 < 2e-16 ***
## bio_6 -1.637e+03 1.751e+02 -9.351 < 2e-16 ***
## bio_7 NA NA NA NA
## bio_8 8.409e+02 9.086e+01 9.255 < 2e-16 ***
## bio_9 -7.335e+01 7.384e+01 -0.993 0.320559
## bio_10 -8.061e+02 1.974e+02 -4.084 4.48e-05 ***
## bio_11 2.300e+03 2.842e+02 8.094 6.83e-16 ***
## bio_12 1.090e+00 2.647e-01 4.116 3.90e-05 ***
## bio_13 7.968e-01 8.857e-01 0.900 0.368304
## bio_14 4.350e+01 1.147e+01 3.792 0.000151 ***
## bio_15 1.844e+01 2.021e+00 9.128 < 2e-16 ***
## bio_16 -1.522e+00 6.773e-01 -2.247 0.024658 *
## bio_17 -1.242e+01 3.270e+00 -3.799 0.000147 ***
## bio_18 4.681e-02 2.382e-01 0.197 0.844221
## bio_19 2.942e+00 2.570e+00 1.145 0.252402
## MAI_ARE 1.628e-02 2.852e-02 0.571 0.568054
## MAI_YLD 1.095e-01 1.527e-02 7.173 8.12e-13 ***
## Longitude 5.937e+01 3.237e+01 1.834 0.066682 .
## Latitude -2.043e+01 3.163e+01 -0.646 0.518352
## rain.sum.lag -1.318e-01 1.865e-02 -7.065 1.77e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 389.5 on 6561 degrees of freedom
## Multiple R-squared: 0.7194, Adjusted R-squared: 0.718
## F-statistic: 494.8 on 34 and 6561 DF, p-value: < 2.2e-16
First check if there are any predictors with NA values
for(column in seq_along(mypts)){
if(any(is.na(mypts[column]))){
print(paste0("Column: ", colnames(mypts)[column], " has at least one NA value"))
}
}
#There are no columns with missing values
Data from May 2021 - Dec 2023 will be used for model training while more recent data from Jan - June 2024 will be used for Validation.
mypts <- as.data.frame(mypts)
# Filter the data for training (May 2021 - Dec 2023)
training_data <- mypts[mypts$Year %in% c(2021, 2022, 2023), ]
# Check training data
#head(training_data)
# Filter the data for validation (Jan 2024 - June 2024)
validation_data <- mypts[mypts$Year == 2024, ]
# Check validation data
#head(validation_data)
The tuneRF function in the randomForest package is used to tune the mtry parameter, which is the number of variables randomly sampled as candidates at each split in the random forest. The function requires a data frame of predictor variables and a response variable.
# Convert training_data data to data frame
mypts_df <- as.data.frame(training_data)
trf <- tuneRF(x=mypts_df[,1:ncol(mypts_df)], # Prediction variables
y=mypts_df$pkg) # Response variable
## mtry = 18 OOB error = 1407.873
## Searching left ...
## mtry = 9 OOB error = 7242.192
## -4.144066 0.05
## Searching right ...
## mtry = 36 OOB error = 162.1689
## 0.8848129 0.05
## mtry = 55 OOB error = 98.42907
## 0.3930459 0.05
Based on the output from tuneRF, you can observe that the mtry value that gives the lowest Out-of-Bag (OOB) error. To build the first random forest model, we will use this mtry value.
(mintree <- trf[which.min(trf[,2]),1])
## [1] 55
Here, the model is fitted using the randomForest function to build a predictive model for food commodity prices. The model takes the response variable, the prediction variables and the optimal number of variables to consider at each split. The goal is to generate Variable importance scores which will help us understand which variables have the most significant impact on the response variable, enabling us to interpret the model and possibly simplify it by focusing on the most important predictors.
# Create the random forest model
rf1 <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
Month +
Year +
ttcity_u5 + ttport_1 +
bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 +
bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 +
bio_13 + bio_14 + bio_15 + bio_16 + bio_17 +
bio_18 + bio_19 + MAI_ARE + MAI_YLD +
Longitude + Latitude +
rain.sum.lag,
data = training_data,mtry=mintree,
importance=TRUE,na.rm=TRUE)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
rf1
##
## Call:
## randomForest(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato + Month + Year + ttcity_u5 + ttport_1 + bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 + MAI_ARE + MAI_YLD + Longitude + Latitude + rain.sum.lag, data = training_data, mtry = mintree, importance = TRUE, na.rm = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 36
##
## Mean of squared residuals: 24140.83
## % Var explained: 95.37
varImpPlot(rf1)
## evaluate
(oob <- sqrt(rf1$mse[which.min(rf1$mse)]))
## [1] 154.9753
This calculates the RMSE of the tree in the Random Forest model that has the lowest OOB error.
importance_metrics <- importance(rf1, type=1) # %IncMSE
impvar <- rownames(importance_metrics)[order(importance_metrics[, 1], decreasing=TRUE)]
# Get the top 20 variables
top_20_vars <- impvar[1:20]
top_20_vars
## [1] "Year" "Month" "beans" "rice" "sorghum"
## [6] "Longitude" "rain.sum.lag" "bmillet" "wheat" "fmillet"
## [11] "bio_12" "bio_18" "MAI_YLD" "bio_15" "MAI_ARE"
## [16] "Latitude" "bio_3" "ttcity_u5" "ttport_1" "bio_4"
node_purity <- importance(rf1, type=2) # IncNodePurity
# Sort variables by importance (IncNodePurity)
node_purity_sorted <- sort(node_purity[,1], decreasing = TRUE)
# Select the top 20 important variables
top_vars <- names(node_purity_sorted)[1:20]
print(top_vars)
## [1] "rice" "beans" "Year" "wheat" "fmillet"
## [6] "Month" "potato" "bio_12" "maize" "Longitude"
## [11] "rain.sum.lag" "sorghum" "bmillet" "bio_3" "bio_7"
## [16] "MAI_YLD" "bio_18" "ttcity_u5" "MAI_ARE" "Latitude"
rf1$importanceSD
## maize rice sorghum bmillet fmillet wheat
## 1955.5603 2879.1917 452.8803 612.4819 1711.3199 1616.8257
## beans potato Month Year ttcity_u5 ttport_1
## 3061.0326 1929.0591 258.5696 548.4348 263.7380 254.9017
## bio_1 bio_2 bio_3 bio_4 bio_5 bio_6
## 139.0556 278.9766 593.3162 168.4609 197.7081 307.8925
## bio_7 bio_8 bio_9 bio_10 bio_11 bio_12
## 749.3357 117.0002 272.9865 155.8555 157.6274 821.6118
## bio_13 bio_14 bio_15 bio_16 bio_17 bio_18
## 153.2510 205.3050 266.3121 149.6587 157.4320 251.5292
## bio_19 MAI_ARE MAI_YLD Longitude Latitude rain.sum.lag
## 193.6378 258.2230 417.8781 366.7401 307.8064 191.8319
In this section, we aim to refine our model by selecting the most important variables. We will review the importance metrics (%IncMSE and IncNodePurity) to identify the variables that contribute the most to the model’s predictive power. Our focus will be on variables with higher importance values to ensure a more efficient and interpretable model.
# Estimate more parsimonious specification
rf <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
Month +
Year +
ttport_1 +
popdens +
bio_3 + bio_6 + bio_9 + bio_12 +
rain.sum.lag,
data=training_data, na.rm=TRUE)
rf
##
## Call:
## randomForest(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato + Month + Year + ttport_1 + popdens + bio_3 + bio_6 + bio_9 + bio_12 + rain.sum.lag, data = training_data, na.rm = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 30062.28
## % Var explained: 94.24
# evaluate
varImpPlot(rf)
(oob <- sqrt(rf$mse[which.min(rf$mse)]))
## [1] 173.3164
partialPlot(rf, as.data.frame(training_data), "rain.sum.lag")
These are prediction plots for each food commodities (maize, beans, rice, sorghum, bmillet, fmillet, wheat, potato) with their respective month of the year 2023.
year <- 2024
Note: we must set the rain.sum.lag variables for each month we are predicting
#Maize
# Create an empty list to store predictions for maize
predict_for_month <- function(month){
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")] # Remember to change depending on year
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_maize <- data.frame(
maize = 1, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_maize, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_maize <- lapply(1:10, predict_for_month)
# Extract pixel values from predictions_maize
maize_values <- unlist(lapply(predictions_maize, values))
# Get min and max values
min_maize <- min(maize_values, na.rm = TRUE)
max_maize <- max(maize_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 100
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 100
# Create a 3x4 matrix of plots
par(mar = c(0, 0, 0, 0)) # Set margins to 0 for inner plots
for (i in 1:10) {
plot(predictions_maize[[i]], main = paste("Maize prices", toupper(i), year),
zlim = c(min_maize, max_maize), col = color_palette, breaks = seq(min_maize, max_maize, by = break_interval), legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_maize, max_maize), n = 4)
# Reset plot layout for the legend
layout(matrix(1))
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_maize, max_maize), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Maize Price (TZS Per Kg)", side = 1, line = 2, cex = 0.9))
# Beans
# Function to predict beans for a given month
predict_for_beans <- function(month) {
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_beans <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 1, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_beans, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_beans <- lapply(1:10, predict_for_beans)
# Extract pixel values from predictions_beans
bean_values <- unlist(lapply(predictions_beans, values))
min_bean <- min(bean_values, na.rm = TRUE)
max_bean <- max(bean_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 150
# Loop through each month to plot beans prices
par(mar = c(0, 0, 0, 0))
for (i in 1:10) {
plot(predictions_beans[[i]], main = paste("Beans prices", toupper(i), year),
zlim = c(min_bean, max_bean), col = color_palette, breaks = seq(min_bean, max_bean, by = break_interval), legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_bean, max_bean), n = 4)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_bean, max_bean), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Beans Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))
# Rice
# Function to predict rice prices for a given month
predict_for_rice <- function(month) {
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_rice <- data.frame(
maize = 0, rice = 1, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_rice, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_rice <- lapply(1:10, predict_for_rice)
# Extract pixel values from predictions_rice
rice_values <- unlist(lapply(predictions_rice, values))
min_rice <- min(rice_values, na.rm = TRUE)
max_rice <- max(rice_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 150
# Loop through each month to plot rice prices
par(mar = c(0, 0, 0, 0))
for (i in 1:10) {
plot(predictions_rice[[i]], main = paste("Rice prices", toupper(i), year),
zlim = c(min_rice, max_rice), col = color_palette, breaks = seq(min_rice, max_rice, by = break_interval), legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_rice, max_rice), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_rice, max_rice), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Rice Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))
# Function to predict sorghum prices for a given month
predict_for_sorghum <- function(month) {
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_sorghum <- data.frame(
maize = 0, rice = 0, sorghum = 1, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = month,
Year = 2023
)
pred <- predict(newstack, rf, const = const_sorghum, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_sorghum <- lapply(1:10, predict_for_sorghum)
# Extract pixel values from predictions_sorghum
sorghum_values <- unlist(lapply(predictions_sorghum, values))
min_sorghum <- min(sorghum_values, na.rm = TRUE)
max_sorghum <- max(sorghum_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 300
# Loop through each month to plot sorghum prices
par(mar = c(0, 0, 0, 0))
for (i in 1:10) {
plot(predictions_sorghum[[i]], main = paste("Sorghum prices", toupper(i), year),
zlim = c(min_sorghum, max_sorghum), col = color_palette, breaks = seq(min_sorghum, max_sorghum, by = break_interval), legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_sorghum, max_sorghum), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_sorghum, max_sorghum), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Sorghum Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))
# bmillet
# Function to predict bmillet prices for a given month
predict_for_bmillet <- function(month) {
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_bmillet <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 1, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_bmillet, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_bmillet <- lapply(1:10, predict_for_bmillet)
# Extract pixel values from predictions_bmillet
bmillet_values <- unlist(lapply(predictions_bmillet, values))
min_bmillet <- min(bmillet_values, na.rm = TRUE)
max_bmillet <- max(bmillet_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout with an extra row for the legend
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Loop through each month to plot bmillet prices
break_interval <- 150
par(mar = c(0, 0, 0, 0))
for (i in 1:10) {
plot(predictions_bmillet[[i]], main = paste("Bmillet prices", toupper(i), year),
zlim = c(min_bmillet, max_bmillet), col = color_palette,
breaks = seq(min_bmillet, max_bmillet, by = break_interval),
legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_bmillet, max_bmillet), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_bmillet, max_bmillet), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Bulrush Millet Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))
#fmillet
# Function to predict fmillet prices for a given month
predict_for_fmillet <- function(month) {
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_fmillet <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 1, wheat = 0, beans = 0, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_fmillet, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_fmillet <- lapply(1:10, predict_for_fmillet)
# Extract pixel values from predictions_fmillet
fmillet_values <- unlist(lapply(predictions_fmillet, values))
min_fmillet <- min(fmillet_values, na.rm = TRUE)
max_fmillet <- max(fmillet_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout with an extra row for the legend
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Loop through each month to plot fmillet prices
break_interval <- 300
par(mar = c(0, 0, 0, 0))
for (i in 1:10) {
plot(predictions_fmillet[[i]], main = paste("Fmillet prices", toupper(i), year),
zlim = c(min_fmillet, max_fmillet), col = color_palette,
breaks = seq(min_fmillet, max_fmillet, by = break_interval),
legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_fmillet, max_fmillet), n = 3)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_fmillet, max_fmillet), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Finger Millet Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))
#Wheat
# Function to predict wheat prices for a given month
predict_for_wheat <- function(month) {
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_wheat <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 1, beans = 0, potato = 0,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_wheat, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_wheat <- lapply(1:10, predict_for_wheat)
# Extract pixel values from predictions_wheat
wheat_values <- unlist(lapply(predictions_wheat, values))
min_wheat <- min(wheat_values, na.rm = TRUE)
max_wheat <- max(wheat_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Define the break interval for both plot and legend
break_interval <- 100
# Loop through each month to plot wheat prices
par(mar = c(0, 0, 0, 0))
for (i in 1:10) {
plot(predictions_wheat[[i]], main = paste("Wheat prices", toupper(i), year),
zlim = c(min_wheat, max_wheat), col = color_palette,
breaks = seq(min_wheat, max_wheat, by = break_interval),
legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_wheat, max_wheat), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_wheat, max_wheat), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Wheat Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))
#potatoes
# Function to predict potato prices for a given month
predict_for_potato <- function(month) {
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_potato <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 1,
Month = month,
Year = year
)
pred <- predict(newstack, rf, const = const_potato, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_potato <- lapply(1:10, predict_for_potato)
# Extract pixel values from predictions_potato
potato_values <- unlist(lapply(predictions_potato, values))
min_potato <- min(potato_values, na.rm = TRUE)
max_potato <- max(potato_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- rev(terrain.colors(100))
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Loop through each month to plot potato prices
break_interval <- 150
par(mar = c(0, 0, 0, 0))
for (i in 1:10) {
plot(predictions_potato[[i]], main = paste("Potato prices", toupper(i), year),
zlim = c(min_potato, max_potato), col = color_palette,
breaks = seq(min_potato, max_potato, by = break_interval),
legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_potato, max_potato), n = 5) # Adjust n as needed
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_potato, max_potato), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Potato Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))
pred<-predict(object=rf, newdata=validation_data)
actual<-validation_data$pkg
result<-data.frame(actual=actual, predicted=pred)
mse <- mean((actual - pred)^2, na.rm=TRUE)
paste('Mean Squared Error:', mse)
## [1] "Mean Squared Error: 156604.920494802"
rmse <- sqrt(mse)
paste('Root Mean Squared error: ',mean(sqrt(rf$mse)))
## [1] "Root Mean Squared error: 177.66089985911"
#Save predicted & observed price
write.csv(result, "result.csv")
#reading result.csv file (predicted vs observed)
rslt <- read.csv("result.csv", header=T)
print(names(rslt))
## [1] "X" "actual" "predicted"
#RMSE predicting from rf - predicited vs observed
rf.rmse<-round(sqrt(mean( (rslt$actual-rslt$predicted)^2 , na.rm = TRUE )),2)
print(rf.rmse)
## [1] 395.73
#R-square
rf.r2<-round(summary(lm(actual~predicted, rslt))$r.squared,2)
print(rf.r2)
## [1] 0.75
range(actual)
## [1] 350 4050
range(pred)
## [1] 615.6416 3266.9242
#plotting predicted Vs observed
ggplot(result, aes(x=actual, y=predicted), alpha=0.6) +
geom_point(colour = "blue", size = 1.4, alpha=0.6) +
ggtitle('Random Forest "Wholesale Grain Prices in Tanzania"') +
scale_x_continuous("Observed Price (Tsh) Per Kg",
limits = c(0, 5000),
breaks = seq(0, 5000, 1000)) +
scale_y_continuous("Predicted Price (Tsh) Per Kg",
limits = c(0, 5000),
breaks = seq(0, 5000, 1000)) +
theme(axis.line = element_line(colour = "black"),
axis.text.y = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 1),
axis.text.x = element_text(size = 8)) +
geom_abline(intercept = 0, slope = 1, linewidth = 0.5) +
geom_smooth(aes(x = actual, y = predicted), formula = y ~ x, method = "lm", se = FALSE, colour = "red", linetype = 2, size = 0.9) +
annotate("text", x = 300, y = 4500, label = paste("RMSE:", rf.rmse)) +
annotate("text", x = 300, y = 4200, label = paste("R^2: ", rf.r2), parse = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(stats)
mypts_df$pred <- stats::predict(rf)
rsq <- function (obs, pred) cor(obs, pred, use = 'complete.obs') ^ 2
RMSE <- function(obs, pred){sqrt(mean((pred - obs)^2, na.rm = TRUE))}
fr2_rsq <- rsq(mypts_df$pkg, mypts_df$pred) %>% round(digits = 2)
fr2_rmse <- RMSE(mypts_df$pkg, mypts_df$pred) %>% round(digits = 0)
Price_fit_plot <- ggplot(data = mypts_df, aes(x = pkg, y = pred)) +
geom_point(colour = "blue", size = 1.4 ,alpha=0.6) +
ggtitle('Observed vs Predicted "Wholesale Grain Prices in Tanzania"') +
geom_abline(slope = 1, alpha=0.3) +
annotate('text', x = 150, y = 4500, label = paste0("R^{2}==", fr2_rsq), parse = TRUE, size=3) +
annotate('text', x = 150, y = 4200, label = paste0("RMSE==", fr2_rmse), parse = TRUE, size=3) +
labs(x = "Observed Price (Tsh) Per Kg", y = "Predicted Price (Tsh) Per Kg") +
xlim(0, 5000) + ylim(0, 5000)
Price_fit_plot
Plot Partial dependence of all the variables used except food commodities and months.
library(caret)
var_importance <- varImp(rf)
impvar <- rownames(var_importance)[order(var_importance[, 1], decreasing=TRUE)]
op <- par(mfrow=c(2, 4))
# exclude food commodities and months
predictors_to_plot <- setdiff(impvar, c("maize", "rice", "sorghum", "bmillet", "fmillet", "wheat", "beans", "potato", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"))
for (i in seq_along(predictors_to_plot)) {
partialPlot(rf, as.data.frame(training_data), predictors_to_plot[i], xlab=predictors_to_plot[i],
main="Partial Dependence")
}
# Compute descriptive statistics for each crop
stats <- mypts %>%
filter(pkg > 0) %>%
group_by(Crop) %>%
summarise(
Mean = mean(pkg, na.rm = TRUE),
Median = median(pkg, na.rm = TRUE),
Minimum = min(pkg, na.rm = TRUE),
Maximum = max(pkg, na.rm = TRUE),
Std_Dev = sd(pkg, na.rm = TRUE),
IQR = IQR(pkg, na.rm = TRUE),
Observations = n()
) %>%
arrange(Crop)
print(stats)
## # A tibble: 8 × 8
## Crop Mean Median Minimum Maximum Std_Dev IQR Observations
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Maize 773. 726. 268. 1722. 266. 438. 946
## 2 Rice 2213. 2200 800 3518. 569. 920. 948
## 3 Sorghum 1263. 1250 420 3250 442. 642. 783
## 4 B.Millet 1355. 1250 469. 4000 515. 785. 632
## 5 F.Millet 1642. 1700 700 3000 345. 450 817
## 6 Wheat 1789. 1800 368. 4050 545. 569. 604
## 7 Beans 2397. 2420 985 3850 561. 952. 944
## 8 Potato 871. 850 300 2000 255. 251. 922
These are correlation plots for pooled sample in two periods: post-harvest (May-Oct) and lean season (Nov-April).
#We'll use mean Monthly data in wide format
prices.monthly
## Region Market Month Year Latitude Longitude mai.price ric.price
## 1: Arusha Arusha 5 2021 -3.36968 36.68808 469.00 1530.000
## 2: Dar es Salaam Temeke 5 2021 -6.85097 39.25672 485.00 1650.000
## 3: Dodoma Majengo 5 2021 -6.17971 35.74109 501.50 1592.000
## 4: Dodoma Kibaigwa 5 2021 -6.08110 36.64645 375.00 NaN
## 5: Kagera Bukoba 5 2021 -1.33140 31.81293 426.25 1260.000
## ---
## 950: Mwanza Mwanza 10 2024 -2.51969 32.90144 735.00 2016.667
## 951: Pwani Mnalani 10 2024 -6.44221 38.90622 925.00 1925.000
## 952: Simiyu Bariadi 10 2024 -2.80355 33.98699 667.50 1466.667
## 953: Kigoma Kigoma 10 2024 -4.89697 29.65987 667.00 1750.000
## 954: Rukwa Sumbawanga 10 2024 -7.95239 31.62319 651.25 1916.667
## sor.price bul.price fin.price whe.price bea.price pot.price
## 1: 695.000 761.000 1333.000 793.0 1545.000 745.0000
## 2: 900.000 900.000 1775.000 1230.0 2420.000 730.0000
## 3: 558.000 581.000 1752.000 NaN 2075.000 618.0000
## 4: 437.500 NaN NaN NaN NaN NaN
## 5: 1375.000 1337.500 1637.500 1637.5 1300.000 675.0000
## ---
## 950: 1416.667 1283.333 1583.333 NaN 2800.000 991.6667
## 951: 1525.000 1800.000 2016.667 2000.0 2920.833 1500.0000
## 952: 1420.833 1825.000 1825.000 2500.0 2845.833 1450.0000
## 953: 1000.000 1500.000 2000.000 1700.0 2025.000 900.0000
## 954: NaN NaN 975.000 725.0 2437.500 670.8333
# Create a function to determine the season
get_season <- function(month) {
if (month %in% 5:10) {
return("Post-Harvest")
} else {
return("Lean Season")
}
}
# Add a 'Season' column to the data
prices.monthly$Season <- sapply(prices.monthly$Month, get_season)
# Post Harvest Data
post_harvest_data <- prices.monthly[prices.monthly$Season == "Post-Harvest", ]
# Remove rows with NaNs from the Post-Harvest data
post_harvest_data <- post_harvest_data[complete.cases(post_harvest_data[, 7:14]), ]
# Lean Season data
lean_season_data <- prices.monthly[prices.monthly$Season == "Lean Season", ]
# Remove rows with NaNs from the Lean Season data
lean_season_data <- lean_season_data[complete.cases(lean_season_data[, 7:14]), ]
# Calculate correlation matrix for Post-Harvest season
post_harvest_corr <- cor(post_harvest_data[, 7:14])
# Calculate correlation matrix for Lean Season
lean_season_corr <- cor(lean_season_data[, 7:14])
# Plot correlation matrix for Post-Harvest season
corrplot(post_harvest_corr,
method = "color",
title = "",
tl.col = "black",
tl.cex = 0.5,
addCoef.col = "black",
number.cex = 0.5,
number.digits = 2)
# Add title to the plot
title(main = "Post-Harvest Correlation Matrix",
line = 3,
cex.main = 0.9)
# Plot correlation matrix for Lean Season
corrplot(lean_season_corr,
method = "color",
title = "",
tl.col = "black",
tl.cex = 0.5,
addCoef.col = "black",
number.cex = 0.5,
number.digits = 2)
# Add title to the plot
title(main = "Lean Season Correlation Matrix",
line = 3,
cex.main = 0.9)
We are comparing pooled vs Crop Specific Price Prediction models to determine which model performs better at prediction. This is done by comparing their prediction’s r2 or rmse respectively.
Here we are using 2024 data for validation.
# Comparison between Pooled model and Crop Specific Model
set.seed(1983)
evaluate_models <- function(crop) {
# Filter data for the specific crop
crop_data <- mypts[mypts$Crop == crop, ]
training_data_crop <- crop_data[crop_data$Year %in% c(2021, 2022, 2023), ]
training_data_crop <- as.data.frame(training_data_crop)
validation_data_crop <- crop_data[crop_data$Year == 2024, ]
validation_data_crop <- as.data.frame(validation_data_crop)
# Predictions for pooled model on specific crop data
pooled_pred_crop <- predict(rf, newdata = validation_data_crop)
actual_pooled <- validation_data_crop$pkg
result_pooled <-data.frame(actual=actual_pooled, predicted=pooled_pred_crop)
# Crop-Specific RF Model
rf_crop <- randomForest(pkg ~ Month + Year + ttport_1 + popdens + bio_3 + bio_6 + bio_9 + bio_12 + bio_18 + rain.sum.lag,
data = training_data_crop, na.rm = TRUE)
# Predictions for crop-specific model
predictions_crop <- predict(rf_crop, newdata = validation_data_crop)
actual_crop <- validation_data_crop$pkg
result_crop_specific <- data.frame(actual=actual_crop, predicted=predictions_crop)
# Calculate performance metrics
rmse_pooled <- round(sqrt(mean((result_pooled$actual-result_pooled$predicted)^2 , na.rm = TRUE )),2)
r2_pooled <- round(summary(lm(actual~predicted, result_pooled))$r.squared,2)
rmse_crop <- round(sqrt(mean((result_crop_specific$actual-result_crop_specific$predicted)^2 , na.rm = TRUE )),2)
r2_crop <- round(summary(lm(actual~predicted, result_crop_specific))$r.squared,2)
return(data.frame(Crop = crop,
Model = c("Pooled", "Crop-Specific"),
RMSE = c(rmse_pooled, rmse_crop),
R_squared = c(r2_pooled, r2_crop)))
}
# Apply the function to all crops
crop <- unique(mypts$Crop)
comparison_df <- do.call(rbind, lapply(crop, evaluate_models))
comparison_df
## Crop Model RMSE R_squared
## 1 Maize Pooled 335.05 0.34
## 2 Maize Crop-Specific 339.85 0.33
## 3 Rice Pooled 557.04 0.32
## 4 Rice Crop-Specific 567.37 0.28
## 5 Sorghum Pooled 318.42 0.53
## 6 Sorghum Crop-Specific 321.36 0.51
## 7 B.Millet Pooled 432.54 0.40
## 8 B.Millet Crop-Specific 432.56 0.40
## 9 F.Millet Pooled 312.51 0.34
## 10 F.Millet Crop-Specific 312.73 0.34
## 11 Wheat Pooled 482.15 0.42
## 12 Wheat Crop-Specific 493.01 0.38
## 13 Beans Pooled 352.01 0.35
## 14 Beans Crop-Specific 364.02 0.34
## 15 Potato Pooled 320.01 0.00
## 16 Potato Crop-Specific 327.89 0.00
#write.csv(comparison_df, "model_comparison_df.csv")
#comparison_df <- read.csv("model_comparison_df.csv")
comparison_df_wide <- comparison_df %>%
pivot_wider(names_from = Model, values_from = c(RMSE, R_squared)) %>%
rename(
RMSE_Crop_Specific = `RMSE_Crop-Specific`,
RMSE_Pooled = `RMSE_Pooled`,
R_squared_Crop_Specific = `R_squared_Crop-Specific`,
R_squared_Pooled = `R_squared_Pooled`
) %>%
group_by(Crop) %>%
summarize(
RMSE_Pooled = max(RMSE_Pooled, na.rm = TRUE),
RMSE_Crop_Specific = max(RMSE_Crop_Specific, na.rm = TRUE),
R_squared_Pooled = max(R_squared_Pooled, na.rm = TRUE),
R_squared_Crop_Specific = max(R_squared_Crop_Specific, na.rm = TRUE)
)
knitr::kable(comparison_df_wide, caption = "Model Comparison: R² and RMSE")
| Crop | RMSE_Pooled | RMSE_Crop_Specific | R_squared_Pooled | R_squared_Crop_Specific |
|---|---|---|---|---|
| Maize | 335.05 | 339.85 | 0.34 | 0.33 |
| Rice | 557.04 | 567.37 | 0.32 | 0.28 |
| Sorghum | 318.42 | 321.36 | 0.53 | 0.51 |
| B.Millet | 432.54 | 432.56 | 0.40 | 0.40 |
| F.Millet | 312.51 | 312.73 | 0.34 | 0.34 |
| Wheat | 482.15 | 493.01 | 0.42 | 0.38 |
| Beans | 352.01 | 364.02 | 0.35 | 0.34 |
| Potato | 320.01 | 327.89 | 0.00 | 0.00 |
Here the dataset is split once (70:30). The model is trained on one part of the data (70%) and evaluated on the remaining set of the data (30%).
# Pooled RF model
##Split the data into train and test datasets
set.seed(1983)
rows <- sample(x=1:nrow(mypts), size = 0.70* nrow(mypts))
train <- mypts[rows, ]
test <- mypts[! rownames(mypts) %in% rows, ]
RF <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans +
Month +
Year +
ttport_1 +
popdens +
bio_3 + bio_6 + bio_9 + bio_12 + bio_18 +
rain.sum.lag,
data=train, na.rm=TRUE)
RF
##
## Call:
## randomForest(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + Month + Year + ttport_1 + popdens + bio_3 + bio_6 + bio_9 + bio_12 + bio_18 + rain.sum.lag, data = train, na.rm = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 47782.6
## % Var explained: 91.02
evaluate_models <- function(crop) {
# Filter data for the specific crop
crop_data <- mypts[mypts$Crop == crop, ]
set.seed(1983)
rownames(crop_data)<-1:nrow(crop_data)
rows <- sample(x=1:nrow(crop_data), size = 0.70* nrow(crop_data))
training_data_crop <- crop_data[rows, ]
validation_data_crop <- crop_data[! rownames(crop_data) %in% rows, ]
# Predictions for pooled model on specific crop data
pooled_pred_crop <- predict(RF, newdata = validation_data_crop)
actual_pooled <- validation_data_crop$pkg
result_pooled <-data.frame(actual=actual_pooled, predicted=pooled_pred_crop)
# Crop-Specific Model
rf_crop <- randomForest(pkg ~ Month + Year + ttport_1 + popdens + bio_3 + bio_6 + bio_9 + bio_12 + bio_18 + rain.sum.lag,
data = training_data_crop, na.rm = TRUE)
# Predictions for crop-specific model
predictions_crop <- predict(rf_crop, newdata = validation_data_crop)
actual_crop <- validation_data_crop$pkg
result_crop_specific <- data.frame(actual=actual_crop, predicted=predictions_crop)
# Calculate performance metrics
rmse_pooled <- round(sqrt(mean((result_pooled$actual-result_pooled$predicted)^2 , na.rm = TRUE )),2)
r2_pooled <- round(summary(lm(actual~predicted, result_pooled))$r.squared,2)
rmse_crop <- round(sqrt(mean((result_crop_specific$actual-result_crop_specific$predicted)^2 , na.rm = TRUE )),2)
r2_crop <- round(summary(lm(actual~predicted, result_crop_specific))$r.squared,2)
return(data.frame(Crop = crop,
Model = c("Pooled", "Crop-Specific"),
RMSE = c(rmse_pooled, rmse_crop),
R_squared = c(r2_pooled, r2_crop)))
}
# Apply function to all crops
crop <- unique(mypts$Crop)
comparison_df3 <- do.call(rbind, lapply(crop, evaluate_models))
comparison_df3
## Crop Model RMSE R_squared
## 1 Maize Pooled 95.80 0.89
## 2 Maize Crop-Specific 113.78 0.83
## 3 Rice Pooled 151.50 0.94
## 4 Rice Crop-Specific 182.37 0.90
## 5 Sorghum Pooled 173.56 0.87
## 6 Sorghum Crop-Specific 209.83 0.77
## 7 B.Millet Pooled 164.25 0.89
## 8 B.Millet Crop-Specific 224.89 0.77
## 9 F.Millet Pooled 155.67 0.80
## 10 F.Millet Crop-Specific 208.40 0.62
## 11 Wheat Pooled 162.06 0.92
## 12 Wheat Crop-Specific 231.81 0.83
## 13 Beans Pooled 173.85 0.92
## 14 Beans Crop-Specific 212.28 0.87
## 15 Potato Pooled 170.21 0.69
## 16 Potato Crop-Specific 137.18 0.73
comparison_df_wide3 <- comparison_df3 %>%
pivot_wider(names_from = Model, values_from = c(RMSE, R_squared)) %>%
rename(RMSE_Crop_Specific = `RMSE_Crop-Specific`,
RMSE_Pooled = `RMSE_Pooled`,
R_squared_Crop_Specific = `R_squared_Crop-Specific`,
R_squared_Pooled = `R_squared_Pooled`)
knitr::kable(comparison_df_wide3, caption = "Model Comparison: R² and RMSE Based on Train-Test Split Validation")
| Crop | RMSE_Pooled | RMSE_Crop_Specific | R_squared_Pooled | R_squared_Crop_Specific |
|---|---|---|---|---|
| Maize | 95.80 | 113.78 | 0.89 | 0.83 |
| Rice | 151.50 | 182.37 | 0.94 | 0.90 |
| Sorghum | 173.56 | 209.83 | 0.87 | 0.77 |
| B.Millet | 164.25 | 224.89 | 0.89 | 0.77 |
| F.Millet | 155.67 | 208.40 | 0.80 | 0.62 |
| Wheat | 162.06 | 231.81 | 0.92 | 0.83 |
| Beans | 173.85 | 212.28 | 0.92 | 0.87 |
| Potato | 170.21 | 137.18 | 0.69 | 0.73 |
# Predicting For markets
mypts <- as.data.frame(mypts)
# Initialize a list to store results for each market
results <- list()
# Get unique markets
markets <- unique(mypts$Market)
# Loop over each market
for (market in markets) {
# Split data: exclude current market for training, and use current market for validation
train_mkts <- mypts %>% filter(Market != market)
valid_mkt <- mypts %>% filter(Market == market)
# Train Random Forest model on the training data
rf_model <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans +
jan + feb + mar + apr + may + jun + jul + aug + sep + oct + nov + dec +
Year + ttport_1 + popdens + bio_3 + bio_6 + bio_9 + bio_12 + bio_18 +
rain.sum.lag,
data = train_mkts, na.rm = TRUE)
# Predict on the validation set for the current market
predictions <- predict(rf_model, newdata = valid_mkt)
actual <- valid_mkt$pkg
# Calculate RMSE and R-squared for the current market
rmse <- round(sqrt(mean((actual - predictions)^2, na.rm = TRUE)), 2)
r2 <- round(summary(lm(actual ~ predictions))$r.squared, 2)
# Store the results in a data frame for the current market
results[[market]] <- data.frame(Market = market, RMSE = rmse, R_squared = r2)
}
# write.csv(results_df, "Results_R2_RMSE.csv")
results_df <- do.call(rbind, results)
knitr::kable(results_df, caption = "Leave-One-Market-Out Cross-Validation")
| Market | RMSE | R_squared | |
|---|---|---|---|
| Arusha | Arusha | 219.34 | 0.92 |
| Temeke | Temeke | 214.50 | 0.93 |
| Majengo | Majengo | 348.17 | 0.85 |
| Kibaigwa | Kibaigwa | 272.77 | 0.94 |
| Bukoba | Bukoba | 318.05 | 0.78 |
| Babati | Babati | 233.73 | 0.87 |
| Sumbawanga | Sumbawanga | 433.17 | 0.76 |
| Mpanda | Mpanda | 396.63 | 0.76 |
| Mtwara | Mtwara | 482.25 | 0.68 |
| Tabora | Tabora | 217.31 | 0.90 |
| Tanga | Tanga | 295.37 | 0.81 |
| Kinondoni | Kinondoni | 196.30 | 0.93 |
| Ilala | Ilala | 247.43 | 0.93 |
| Iringa | Iringa | 388.58 | 0.74 |
| Kigoma | Kigoma | 316.41 | 0.78 |
| Morogoro | Morogoro | 260.77 | 0.86 |
| Mwanza | Mwanza | 307.11 | 0.81 |
| Musoma | Musoma | 420.12 | 0.55 |
| Songea | Songea | 422.90 | 0.83 |
| Shinyanga | Shinyanga | 389.33 | 0.74 |
| Moshi | Moshi | 363.48 | 0.84 |
| Mbeya | Mbeya | 307.84 | 0.85 |
| Njombe | Njombe | 352.91 | 0.84 |
| Lindi | Lindi | 727.47 | 0.57 |
| Manyara | Manyara | 327.49 | 0.79 |
| Tandika | Tandika | 216.96 | 0.95 |
| Buguruni | Buguruni | 154.29 | 0.99 |
| Tandale | Tandale | 218.89 | 0.95 |
| Singida | Singida | 219.75 | 0.97 |
| Pwani | Pwani | 349.67 | 0.75 |
| Bariadi | Bariadi | 248.45 | 0.85 |
| Mpimbwe | Mpimbwe | 457.23 | 0.77 |
| Nyankumbu | Nyankumbu | 428.87 | 0.71 |
| Songwe | Songwe | 500.93 | 0.50 |
| Mwananyamala | Mwananyamala | 226.61 | 0.93 |
| Namfua | Namfua | 168.57 | 0.95 |
| Kilombero | Kilombero | 203.79 | 0.92 |
| Mgandini | Mgandini | 356.93 | 0.62 |
| majengo | majengo | 245.12 | 0.92 |
| Igawilo | Igawilo | 291.08 | 0.83 |
| Mnalani | Mnalani | 139.41 | 0.95 |
| Ubungo | Ubungo | 285.63 | 0.97 |
# Calculate the mean and standard deviation for RMSE and R-squared
mean_rmse <- mean(results_df$RMSE)
sd_rmse <- sd(results_df$RMSE)
mean_r2 <- mean(results_df$R_squared)
sd_r2 <- sd(results_df$R_squared)
summary_table <- data.frame(
Metric = c("RMSE", "R-squared"),
Mean = c(mean_rmse, mean_r2),
SD = c(sd_rmse, sd_r2)
)
knitr::kable(summary_table, caption = "mean and standard deviation for RMSE and R-squared of LOMO-CV")
| Metric | Mean | SD |
|---|---|---|
| RMSE | 313.6097619 | 113.0821921 |
| R-squared | 0.8283333 | 0.1196319 |
results_df <- read.csv("Results_R2_RMSE.csv")
# Convert to spatial data
cods_vect <- terra::vect(results_df, geom = c("Longitude", "Latitude"), crs = crs(tza1), keepgeom=FALSE)
cods_sf <- sf::st_as_sf(cods_vect)
tza1_sf <- sf::st_as_sf(tza1)
# Plot R^2 values on the Tanzania map
ggplot(data = tza1_sf) +
# Base map with Tanzania's boundaries
geom_sf(fill = "white", color = "gray") +
# Add market-level points colored and sized by R-squared values
geom_sf(data = cods_sf, aes(size = R_squared, color = R_squared), alpha = 0.8) +
scale_color_viridis_c(option = "C", name = "R-squared") +
scale_size_continuous(range = c(2, 10), name = "R-squared") +
guides(
size = guide_legend(order = 1),
color = guide_legend(order = 1)
) +
labs(title = "Market-Level R-squared Values",
x = "Longitude", y = "Latitude") +
theme_minimal()
The OSM dataset used was obtained from Tanzania Populated Places (OpenStreetMap Export) - https://data.humdata.org/dataset/hotosm_tza_populated_places?
# Extract Predicted price using OSM Data
# Bring in Tanzania Populated Places from OSM
TZ_populated_places <- vect("H:\\Tanzania Price data\\Datasets\\OSM_Population\\hotosm_tza_populated_places_points_shp.shp")
# head(TZ_populated_places)
# Plot All data points
plot(tza1)
plot(TZ_populated_places, col="Red", add=TRUE)
sapply(TZ_populated_places, class)
## name name_en place population is_in source
## "character" "character" "character" "character" "character" "character"
## name_sw osm_id osm_type
## "character" "integer" "character"
# Ensure that the population column is numeric
TZ_populated_places$population <- as.numeric(TZ_populated_places$population)
# Filter to remain with rows where population is 20,000 or more
TZ_populated_places_filtered <- TZ_populated_places_filtered <- TZ_populated_places[!is.na(TZ_populated_places$population) & TZ_populated_places$population >= 20000, ]
# Plot places where population data is provided
plot(tza1)
plot(TZ_populated_places_filtered, col="Red", add=TRUE)
# Read the predicted price raster data for Maize
output_dir_Maize <- "H:/Tanzania Price data/Datasets/Pred-plots/Maize"
Maize_price_list <- list.files(output_dir_Maize, pattern = ".tif$", full.names = TRUE)
Maize_price_list
## [1] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_01.tif"
## [2] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_02.tif"
## [3] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_03.tif"
## [4] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_04.tif"
## [5] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_05.tif"
## [6] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_06.tif"
## [7] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_07.tif"
## [8] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_08.tif"
## [9] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_09.tif"
## [10] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_10.tif"
Maize_price_rast <- rast(Maize_price_list)
# Extract predicted maize prices for OSM points
osm_points_with_prices <- TZ_populated_places_filtered
# Loop over each month and extract the maize price for OSM points
for (i in 1:nlyr(Maize_price_rast)) {
# Extract prices for month i
prices_for_month <- terra::extract(Maize_price_rast[[i]], TZ_populated_places_filtered, method = "bilinear")
# Add the extracted prices to the SpatVector (each month as a separate column)
osm_points_with_prices[[paste0("maize_price_month_", sprintf("%02d", i))]] <- prices_for_month[,2]
}
# Filter to remain only with necessary columns
osm_points_with_prices_select <- osm_points_with_prices[, c("name",
"place", "is_in", "population", "osm_id", "maize_price_month_01", "maize_price_month_02", "maize_price_month_03", "maize_price_month_04",
"maize_price_month_05", "maize_price_month_06", "maize_price_month_07",
"maize_price_month_08")]
osm_points_with_prices_select <- as.data.frame(osm_points_with_prices_select)
# Reshape to long format with a Month and price columns
osm_points_long <- osm_points_with_prices_select %>%
pivot_longer(cols = starts_with("maize_price_month_"),
names_to = "Month",
values_to = "price")
# Clean month column to extract only the month number
osm_points_long$Month <- as.numeric(gsub("maize_price_month_", "", osm_points_long$Month))
# Add a new column crop with Maize
osm_points_long$crop <- "maize"
osm_points_long <- as.data.table(osm_points_long)
ppts <- head(osm_points_long, 50)
knitr::kable(ppts, caption = "Extracted Maize Prices For Different Markets in Tanzania For the Year 2024")
| name | place | is_in | population | osm_id | Month | price | crop |
|---|---|---|---|---|---|---|---|
| Geita | city | NA | 318006 | 258307668 | 1 | 1071.4069 | maize |
| Geita | city | NA | 318006 | 258307668 | 2 | 1040.3415 | maize |
| Geita | city | NA | 318006 | 258307668 | 3 | 1032.7237 | maize |
| Geita | city | NA | 318006 | 258307668 | 4 | 968.8754 | maize |
| Geita | city | NA | 318006 | 258307668 | 5 | 978.2965 | maize |
| Geita | city | NA | 318006 | 258307668 | 6 | 995.3333 | maize |
| Geita | city | NA | 318006 | 258307668 | 7 | 994.3241 | maize |
| Geita | city | NA | 318006 | 258307668 | 8 | 984.8319 | maize |
| Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 1 | 954.2232 | maize |
| Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 2 | 916.2251 | maize |
| Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 3 | 895.5600 | maize |
| Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 4 | 841.1950 | maize |
| Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 5 | 810.0676 | maize |
| Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 6 | 807.7249 | maize |
| Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 7 | 800.8484 | maize |
| Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 8 | 802.6259 | maize |
| Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 1 | 1097.4390 | maize |
| Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 2 | 1077.4080 | maize |
| Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 3 | 1058.0266 | maize |
| Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 4 | 954.7785 | maize |
| Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 5 | 952.6296 | maize |
| Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 6 | 973.3632 | maize |
| Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 7 | 977.1048 | maize |
| Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 8 | 978.0525 | maize |
| Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 1 | 964.7503 | maize |
| Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 2 | 951.0346 | maize |
| Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 3 | 942.9071 | maize |
| Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 4 | 913.3199 | maize |
| Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 5 | 872.9376 | maize |
| Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 6 | 849.7830 | maize |
| Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 7 | 818.5362 | maize |
| Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 8 | 828.0533 | maize |
| Chake-Chake | town | NA | 52047 | 622514835 | 1 | 1080.6272 | maize |
| Chake-Chake | town | NA | 52047 | 622514835 | 2 | 1089.0785 | maize |
| Chake-Chake | town | NA | 52047 | 622514835 | 3 | 1068.1354 | maize |
| Chake-Chake | town | NA | 52047 | 622514835 | 4 | 1026.9848 | maize |
| Chake-Chake | town | NA | 52047 | 622514835 | 5 | 1084.0845 | maize |
| Chake-Chake | town | NA | 52047 | 622514835 | 6 | 1081.5926 | maize |
| Chake-Chake | town | NA | 52047 | 622514835 | 7 | 1079.0308 | maize |
| Chake-Chake | town | NA | 52047 | 622514835 | 8 | 1055.8905 | maize |
| Babati | town | NA | 67445 | -1047512940 | 1 | 953.8866 | maize |
| Babati | town | NA | 67445 | -1047512940 | 2 | 958.2093 | maize |
| Babati | town | NA | 67445 | -1047512940 | 3 | 959.7328 | maize |
| Babati | town | NA | 67445 | -1047512940 | 4 | 939.8418 | maize |
| Babati | town | NA | 67445 | -1047512940 | 5 | 915.3608 | maize |
| Babati | town | NA | 67445 | -1047512940 | 6 | 915.8868 | maize |
| Babati | town | NA | 67445 | -1047512940 | 7 | 915.1750 | maize |
| Babati | town | NA | 67445 | -1047512940 | 8 | 913.8773 | maize |
| Vikindu | town | NA | 70000 | 1826809646 | 1 | 1093.1816 | maize |
| Vikindu | town | NA | 70000 | 1826809646 | 2 | 1083.0636 | maize |
# Slope from geodata
# Extract slope from srtm using terrain function in terra package
# tza_alt <- elevation_30s("Tanzania", path=".")
# Obtain the slope in radians
# slope_radian <- terrain(tza_alt, "slope", unit="radians", neighbors=8)
# terra::plot(slope_radian, main = "Slope of Tanzania (Radians)")
# Save slope raster
# output_dir <- "H:/Tanzania Price data/Datasets/Cost_surface data"
# output_file <- paste0(output_dir, "/tz_slope_radian.tif")
#writeRaster(slope_radian, output_file, overwrite = TRUE)
# Set Lambert Azimuthal Equal-Area (LAEA) projection centered on Tanzania
laea_tz <- "+proj=laea +lat_0=-6 +lon_0=35 +datum=WGS84 +units=m +no_defs"
# Load and reproject slope layer
slope_radian <- terra::rast("H:/Tanzania Price data/Datasets/Cost_surface data/tz_slope_radian.tif")
# Reproject to LAEA and set resolution to 500m
slope_radian_laea <- terra::project(slope_radian, laea_tz, res = 500,
filename = "slope_laea_500m.tif", overwrite = TRUE)
terra::plot(slope_radian_laea, main = "Slope of Tanzania (Radians)")
# Use the slope layer obtained above to create a decay coefficient
# We use a decay coefficient of 1.5
# Create slope cost surface
decay <- 1.5
slope_cost <- exp(decay * tan(slope_radian_laea))
names(slope_cost) <- "slope_cost"
terra::plot(slope_cost, main = "Slope cost")
# Roads Data from OSM
# Obtain roads data from osmdata. Code is in reagro.
# roads <- geodata::osm("Tanzania", "highways", ".")
# writeVector(roads, "H:/Tanzania Price data/Datasets/Cost_surface data/tz_roads.shp", overwrite = TRUE)
# Load and reproject roads
roads <- terra::vect("H:/Tanzania Price data/Datasets/Cost_surface data/tz_roads.shp")
# Reproject the roads vector to LAEA
roads_laea <- terra::project(roads, laea_tz)
terra::plot(slope_radian_laea)
terra::lines(roads_laea, col="black")
terra::lines(roads_laea[roads_laea$highway == "primary", ], lwd=4, col="red")
terra::lines(roads_laea[roads_laea$highway == "secondary", ], lwd=2, col="blue")
# Create road cost surface
# Create road cost surface
cfile <- "rdcost_laea.tif"
roadtypes <- c("primary", "secondary", "tertiary")
if (!file.exists(cfile)) {
i <- match(roads_laea$highway, roadtypes)
roads_laea$speed <- c(0.001, 0.0015, 0.002)[i]
rd_cost <- rasterize(roads_laea, slope_radian, field=roads_laea$speed, filename=cfile, overwrite=TRUE)
} else {
rd_cost <- terra::rast(cfile)
}
rd_cost_laea <- terra::project(rd_cost, laea_tz, res = 500,
filename = "rd_cost_laea_500m.tif", overwrite = TRUE)
a <- aggregate(rd_cost_laea, 3, min, na.rm=TRUE)
terra::plot(a, col=c("black", "blue", "red"), main = "Road travel cost (min/m)")
# Load land cover layer
tza_lulc <- terra::rast("H:/Tanzania Price data/Datasets/Cost_surface data/TZ_Land_cover_2021.tif")
plot(tza_lulc, main = "WorldCover 10 m 2021 v200 - landcover classes")
terra::lines(tza1)
Table_class <- data.frame(
Value = c(10,20,30,40,50,60,70,80,90, 95),
LandClass = c("Tree cover",
"Shrubland",
"Grassland",
"Cropland",
"Built-up",
"Bare/sparse vegetation",
"Snow and Ice",
"Permanent water bodies",
"Herbaceous wetland",
"Mangroves"),
travel_speeds=c(0.04, 0.02, 0.02, 0.01, 0.01, 0.02, 0.04, 0.11, 0.04, 0.05)
)
knitr::kable(Table_class, caption = "Land Cover classes (ESA WorldCover 10 m 2021 v200)")
| Value | LandClass | travel_speeds |
|---|---|---|
| 10 | Tree cover | 0.04 |
| 20 | Shrubland | 0.02 |
| 30 | Grassland | 0.02 |
| 40 | Cropland | 0.01 |
| 50 | Built-up | 0.01 |
| 60 | Bare/sparse vegetation | 0.02 |
| 70 | Snow and Ice | 0.04 |
| 80 | Permanent water bodies | 0.11 |
| 90 | Herbaceous wetland | 0.04 |
| 95 | Mangroves | 0.05 |
rc <- data.frame(from=unique(tza_lulc)[,1], to=0.02)
rc <- data.frame(from=unique(tza_lulc)[,1], to=0.02)
# Adjust Reclassification Table Based on the Classes
#rc <- data.frame(from = Table_class$Value, to = Table_class$travel_speeds)
# Assign specific travel speeds based on land cover class values
rc$to[rc$from %in% c(10)] <- 0.04 # Tree cover
rc$to[rc$from %in% c(20, 30)] <- 0.02 # Shrubland and Grassland
rc$to[rc$from == 40] <- 0.01 # Cropland
rc$to[rc$from == 50] <- 0.01 # Built-up
rc$to[rc$from == 60] <- 0.02 # Bare/sparse vegetation
rc$to[rc$from == 70] <- 0.04 # Snow and Ice
rc$to[rc$from == 80] <- 0.11 # Permanent water bodies
rc$to[rc$from == 90] <- 0.04 # Herbaceous wetland
rc$to[rc$from == 95] <- 0.05 # Mangroves
rc
## from to
## 1 10 0.04
## 2 20 0.02
## 3 30 0.02
## 4 40 0.01
## 5 50 0.01
## 6 60 0.02
## 7 70 0.04
## 8 80 0.11
## 9 90 0.04
## 10 95 0.05
#reclassifying
tza_lc_reclass <- classify(tza_lulc, rc)
##
|---------|---------|---------|---------|
=========================================
lcfname <- "lc_cost.tif"
if (!file.exists(lcfname)) {
# first aggregate to about the same spatial resolution
lc_cost <- aggregate(tza_lc_reclass, 3, mean)
# then resample
lc_cost <- resample(lc_cost, slope_radian, filename=lcfname, wopt=list(names="lc_cost"), overwrite=TRUE)
} else {
lc_cost <- rast(lcfname)
}
lc_cost_laea <- terra::project(lc_cost, laea_tz, res = 500,
filename = "lc_cost_laea_500m.tif", overwrite = TRUE)
terra::plot(lc_cost_laea, main = "Off-road travel costs (min/m) based on land cover class")
# Resample to harmonize the resolution
#lc_cost_resampled <- terra::resample(lc_cost, slope_cost)
#names(lc_cost_resampled) <- "lc_cost"
#rd_cost_resample <- terra::resample(rd_cost, slope_cost)
#names(rd_cost_resample) <- "rd_cost"
# Combine the cost layers
all_cost <- c(rd_cost_laea, lc_cost_laea)
#getting the minimum value of each grid cell
cost <- min(all_cost, na.rm=TRUE)
cost <- cost * slope_cost
terra::plot(cost, main="Final cost layer (min/m)")
# Combine the cost layers
library(gdistance)
cost <- raster(cost)
conductance <- 1/cost
#Creating a transition object
tran <- transition(conductance, transitionFunction=mean, directions= 8)
tran <- geoCorrection(tran, type="c")
save(trans, file = "H:/Tanzania Price data/Datasets/Cost_surface data/transition.rda")
# Towns of 20,000 people or more from OSM
town <- terra::vect("H:/Tanzania Price data/Datasets/Cost_surface data/tz_towns.shp")
# Project the towns to match the CRS of slope_radian
town_prj <- project(town, crs(slope_cost))
# geom(town_prj)
# Extracting coordinates and population from the projected towns
towns <- data.frame(
x = geom(town_prj)[, 3],
y = geom(town_prj)[, 4],
population = town_prj$population # Population from the projected towns
)
# towns
#convert to spatial points needed in gdistance
spTowns <- SpatialPoints(cbind(towns$x, towns$y))
spTowns
## class : SpatialPoints
## features : 94
## extent : -584189.9, 567514.8, -558136.2, 532149.3 (xmin, xmax, ymin, ymax)
## crs : NA
#Estimating
Ac <- accCost(tran, fromCoords=spTowns)
A <- rast(Ac) / 60
AA <- clamp(A, 0, 24) |> mask(slope_radian_laea)
## Warning: [mask] CRS do not match
terra::plot(AA, main="Access to markets (towns > 20k) in Tanzania (hrs)")
terra::lines(roads_laea)
terra::points(town_prj, col="red", pch=20, cex=1.0)
#Load trans object that was created
load("H:/Tanzania Price data/Datasets/Cost_surface data/transition.rda")
terra::plot(raster(tran), main='Transition matrix (minutes)')
terra::plot(town_prj, col = "red", cex = 0.5, add=TRUE)
# We'll use towns our >20k from OSM
town1 <- terra::vect("H:/Tanzania Price data/Datasets/Cost_surface data/tz_towns.shp")
# Extract predicted maize prices for OSM points
# Lets use January Maize price for now
maize_price_jan <- rast("H:\\Tanzania Price data\\Datasets\\Pred-plots\\Maize\\maize_price_rf_pred_01.tif")
maize_price_jan_laea <- terra::project(maize_price_jan, laea_tz, res = 500)
maize_price_jan_osm <- terra::extract(maize_price_jan_laea, town_prj, fun=mean, buffer=2500, small=TRUE)
# the extract function is returning some NA values, complete the list with a mean of the other values
price_values <- maize_price_jan_osm[, 2]
# Replace NAs with the mean of the non-NA values
maize_price_jan_osm[, 2][is.na(price_values)] <- mean(price_values, na.rm = TRUE)
head(maize_price_jan_osm)
## ID lyr1
## 1 1 1071.4312
## 2 2 954.1059
## 3 3 1096.9810
## 4 4 964.8270
## 5 5 1079.5992
## 6 6 954.1497
library(sp)
town_prj[["maipkg"]] <- maize_price_jan_osm$lyr1
town_sp <- as(town_prj, "Spatial")
#transportation cost. I converted 0.02 usd/kg/h to TZS/kg/hr = 54.43
tr_cost <- 54.43 #TZS/kg/hr
# Folder to store temporary rasters
temp_dir <- "H:/Tanzania Price data/Datasets/Cost_surface data/Temp_fgate_rasters/"
if (!dir.exists(temp_dir)) {
dir.create(temp_dir, recursive = TRUE)
}
# Loop to calculate and save each farmgate price raster
for (x in seq_along(town_sp)) {
tmp.acc <- accCost(tran, town_sp[x,]) / 60
market_price <- as.data.frame(town_sp)[x, "maipkg"]
# Calculate farmgate price
fgate_price <- market_price - (54.43 * tmp.acc)
fgate_price[fgate_price < 0] <- 0
# Save each raster to disk with a unique filename
save_path <- paste0(temp_dir, "fgate_price_", x, ".tif")
terra::writeRaster(fgate_price, save_path, overwrite = TRUE)
}
# Read all saved rasters back into a list
raster_files <- list.files(temp_dir, pattern = "\\.tif$", full.names = TRUE)
fgate_rasters <- lapply(raster_files, terra::rast)
# Combine all rasters into a single SpatRaster stack
mystack_spat <- do.call(c, fgate_rasters)
fgate_price <- max(mystack_spat) %>% resample(.,maize_price_jan_laea) %>% mask(.,maize_price_jan_laea)
##
|---------|---------|---------|---------|
=========================================
maize_price_jan_laea; fgate_price
## class : SpatRaster
## dimensions : 2669, 2669, 1 (nrow, ncol, nlyr)
## resolution : 500, 500 (x, y)
## extent : -668509.2, 665990.8, -671388, 663112 (xmin, xmax, ymin, ymax)
## coord. ref. : +proj=laea +lat_0=-6 +lon_0=35 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs
## source(s) : memory
## name : lyr1
## min value : 864.5159
## max value : 1364.5247
## class : SpatRaster
## dimensions : 2669, 2669, 1 (nrow, ncol, nlyr)
## resolution : 500, 500 (x, y)
## extent : -668509.2, 665990.8, -671388, 663112 (xmin, xmax, ymin, ymax)
## coord. ref. : +proj=laea +lat_0=-6 +lon_0=35 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs
## source(s) : memory
## name : max
## min value : 0.000
## max value : 1190.293
terra::compareGeom(maize_price_jan_laea, fgate_price)
## [1] TRUE
names(fgate_price) <- "Farm_Gate_Price"
plot(fgate_price, main="Farm gate prices Jan 2024")
names(maize_price_jan_laea) <- "Pred_Maize_price_jan"
maize_values_jan <- terra::values(maize_price_jan_laea, na.rm = TRUE)
# Get minimum and maximum values
min_maize_jan <- min(maize_values_jan, na.rm = TRUE)
max_maize_jan <- max(maize_values_jan, na.rm = TRUE)
# Define the break interval for plotting
break_interval <- 100
breaks_seq <- seq(min_maize_jan, max_maize_jan, by = break_interval)
color_palette <- rev(terrain.colors(100))
# Plot
terra::plot(
maize_price_jan_laea,
main = "Predicted Maize Price Jan 2024",
zlim = c(min_maize_jan, max_maize_jan),
col = color_palette,
breaks = breaks_seq
)
# Set up the layout for 2 plots side by side
par(mfrow = c(1, 2), mar = c(4, 4, 2, 1))
# Plot Maize Price
plot(
maize_price_jan_laea,
main = "Predicted Maize Price Jan 2024",
zlim = c(min_maize_jan, max_maize_jan),
col = color_palette,
breaks = breaks_seq
)
# Plot Farm Gate Price
plot(fgate_price, main="Farm gate prices Jan 2024")